diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index 15df6216f89a4..a41df9606749f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -178,6 +178,20 @@ bool AMDGPUAtomicOptimizerImpl::run(Function &F) { return Changed; } +static bool isLegalCrossLaneType(Type *Ty) { + switch (Ty->getTypeID()) { + case Type::FloatTyID: + case Type::DoubleTyID: + return true; + case Type::IntegerTyID: { + unsigned Size = Ty->getIntegerBitWidth(); + return (Size == 32 || Size == 64); + } + default: + return false; + } +} + void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) { // Early exit for unhandled address space atomic instructions. switch (I.getPointerAddressSpace()) { @@ -228,11 +242,14 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) { // If the value operand is divergent, each lane is contributing a different // value to the atomic calculation. We can only optimize divergent values if - // we have DPP available on our subtarget, and the atomic operation is 32 - // bits. - if (ValDivergent && - (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) { - return; + // we have DPP available on our subtarget (for DPP strategy), and the atomic + // operation is 32 or 64 bits. + if (ValDivergent) { + if (ScanImpl == ScanOptions::DPP && !ST->hasDPP()) + return; + + if (!isLegalCrossLaneType(I.getType())) + return; } // If we get here, we can optimize the atomic using a single wavefront-wide @@ -311,11 +328,14 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) { // If the value operand is divergent, each lane is contributing a different // value to the atomic calculation. We can only optimize divergent values if - // we have DPP available on our subtarget, and the atomic operation is 32 - // bits. - if (ValDivergent && - (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) { - return; + // we have DPP available on our subtarget (for DPP strategy), and the atomic + // operation is 32 or 64 bits. + if (ValDivergent) { + if (ScanImpl == ScanOptions::DPP && !ST->hasDPP()) + return; + + if (!isLegalCrossLaneType(I.getType())) + return; } // If any of the other arguments to the intrinsic are divergent, we can't @@ -748,7 +768,6 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, // of each active lane in the wavefront. This will be our new value // which we will provide to the atomic operation. Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); - assert(TyBitWidth == 32); NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane, {NewV, LastLaneIdx}); } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll index b058ad1023e13..b54aec935bd5f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll @@ -1,249 +1,1247 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefixes=GFX90A,GFX90A_ITERATIVE %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefixes=GFX90A,GFX90A_DPP %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefixes=GFX940,GFX940_ITERATIVE %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefixes=GFX940,GFX940_DPP %s define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void } define amdgpu_ps double @global_atomic_fadd_f64_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void } define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void } define amdgpu_ps double @global_atomic_fadd_f64_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void } define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret void } define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw + ; GFX90A_ITERATIVE: bb.1 (%ir-block.0): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.6(0x40000000) + ; GFX90A_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX90A_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.2 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.2 (%ir-block.5): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.7(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.7 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.3 (%ir-block.7): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_ITERATIVE-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], %25, [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.4.Flow: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.6(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: SI_END_CF %35, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.6 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.5 (%ir-block.9): + ; GFX90A_ITERATIVE-NEXT: S_ENDPGM 0 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.6.Flow1: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.5(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.5 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.7.ComputeLoop: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.8(0x04000000), %bb.7(0x7c000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %17, %bb.7, [[S_MOV_B]], %bb.2 + ; GFX90A_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI %22, %bb.7, [[COPY4]], %bb.2 + ; GFX90A_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_FFBL_B32_e64_:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY5]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_FFBL_B32_e64_1:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY6]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 32, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_FFBL_B32_e64_1]], [[V_MOV_B32_e32_1]], 0, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_MIN_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[V_FFBL_B32_e64_]], [[V_ADD_U32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY2]], [[V_READFIRSTLANE_B32_]] + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY3]], [[V_READFIRSTLANE_B32_1]] + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI]], 0, [[COPY7]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1 + ; GFX90A_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]] + ; GFX90A_ITERATIVE-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY8]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY9]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_NOT_B32_e32_1:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY10]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY11]], [[V_NOT_B32_e32_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY12]], [[V_NOT_B32_e32_1]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; GFX90A_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]] + ; GFX90A_ITERATIVE-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE2]], [[COPY13]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: $vcc = COPY [[V_CMP_NE_U64_e64_]] + ; GFX90A_ITERATIVE-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit $vcc + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.8 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.8.ComputeEnd: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_]], %bb.7 + ; GFX90A_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY16]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE3]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY14]] + ; GFX90A_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY18]], [[COPY19]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY17]] + ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY20]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY21]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.3 + ; + ; GFX90A_DPP-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw + ; GFX90A_DPP: bb.1 (%ir-block.0): + ; GFX90A_DPP-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; GFX90A_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_DPP-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_DPP-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX90A_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.2 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.2 (%ir-block.5): + ; GFX90A_DPP-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX90A_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX90A_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX90A_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0 + ; GFX90A_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX90A_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY9]], [[COPY10]], implicit $exec + ; GFX90A_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX90A_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY11]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_DPP-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX90A_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[REG_SEQUENCE1]], [[COPY12]], implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY13]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, [[V_MOV_B]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY14]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY15]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY16]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY17]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY18:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY18]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX90A_DPP-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0 + ; GFX90A_DPP-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1 + ; GFX90A_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY19]], [[S_MOV_B32_2]] + ; GFX90A_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY20]], [[S_MOV_B32_2]] + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX90A_DPP-NEXT: [[STRICT_WWM:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[COPY21]], implicit $exec + ; GFX90A_DPP-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY22]], implicit $exec + ; GFX90A_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.3 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.3 (%ir-block.31): + ; GFX90A_DPP-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_DPP-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.4.Flow: + ; GFX90A_DPP-NEXT: successors: %bb.5(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.5 (%ir-block.33): + ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_ENDPGM 0 + ; + ; GFX940_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw + ; GFX940_ITERATIVE: bb.1 (%ir-block.0): + ; GFX940_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.6(0x40000000) + ; GFX940_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX940_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX940_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.2 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.2 (%ir-block.5): + ; GFX940_ITERATIVE-NEXT: successors: %bb.7(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.7 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.3 (%ir-block.7): + ; GFX940_ITERATIVE-NEXT: successors: %bb.4(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940_ITERATIVE-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], %24, [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.4.Flow: + ; GFX940_ITERATIVE-NEXT: successors: %bb.6(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: SI_END_CF %34, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.6 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.5 (%ir-block.9): + ; GFX940_ITERATIVE-NEXT: S_ENDPGM 0 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.6.Flow1: + ; GFX940_ITERATIVE-NEXT: successors: %bb.5(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.5 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.7.ComputeLoop: + ; GFX940_ITERATIVE-NEXT: successors: %bb.8(0x04000000), %bb.7(0x7c000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %16, %bb.7, [[S_MOV_B]], %bb.2 + ; GFX940_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI %21, %bb.7, [[COPY4]], %bb.2 + ; GFX940_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_FFBL_B32_e64_:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY5]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_FFBL_B32_e64_1:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY6]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 32, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_FFBL_B32_e64_1]], [[V_MOV_B32_e32_1]], 0, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_MIN_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[V_FFBL_B32_e64_]], [[V_ADD_U32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY2]], [[V_READFIRSTLANE_B32_]] + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY3]], [[V_READFIRSTLANE_B32_1]] + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI]], 0, [[COPY7]], 0, 0, implicit $mode, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1 + ; GFX940_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]] + ; GFX940_ITERATIVE-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY8]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY9]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_NOT_B32_e32_1:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY10]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY11]], [[V_NOT_B32_e32_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY12]], [[V_NOT_B32_e32_1]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; GFX940_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]] + ; GFX940_ITERATIVE-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE2]], [[COPY13]], implicit $exec + ; GFX940_ITERATIVE-NEXT: $vcc = COPY [[V_CMP_NE_U64_e64_]] + ; GFX940_ITERATIVE-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit $vcc + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.8 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.8.ComputeEnd: + ; GFX940_ITERATIVE-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_]], %bb.7 + ; GFX940_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY16]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE3]].sub0 + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY14]] + ; GFX940_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY18]], [[COPY19]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY17]] + ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY20]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY21]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.3 + ; + ; GFX940_DPP-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw + ; GFX940_DPP: bb.1 (%ir-block.0): + ; GFX940_DPP-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; GFX940_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX940_DPP-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX940_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940_DPP-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX940_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.2 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.2 (%ir-block.5): + ; GFX940_DPP-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX940_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX940_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX940_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX940_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0 + ; GFX940_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX940_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY9]], [[COPY10]], implicit $exec + ; GFX940_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX940_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY11]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940_DPP-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX940_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[REG_SEQUENCE1]], [[COPY12]], implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY13]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, [[V_MOV_B]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY14]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY15]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY16]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY17]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY18:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY18]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX940_DPP-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0 + ; GFX940_DPP-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1 + ; GFX940_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY19]], [[S_MOV_B32_2]] + ; GFX940_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY20]], [[S_MOV_B32_2]] + ; GFX940_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX940_DPP-NEXT: [[STRICT_WWM:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[COPY21]], implicit $exec + ; GFX940_DPP-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY22]], implicit $exec + ; GFX940_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.3 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.3 (%ir-block.31): + ; GFX940_DPP-NEXT: successors: %bb.4(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940_DPP-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.4.Flow: + ; GFX940_DPP-NEXT: successors: %bb.5(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.5 (%ir-block.33): + ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_DPP-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret void } define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw + ; GFX90A_ITERATIVE: bb.1 (%ir-block.0): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.6(0x40000000) + ; GFX90A_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX90A_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.2 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.2 (%ir-block.5): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.7(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.7 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.3 (%ir-block.7): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], %28, [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.4 (%ir-block.9): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.6(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.3, [[DEF]], %bb.8 + ; GFX90A_ITERATIVE-NEXT: SI_END_CF %38, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY7]], 0, %27, 0, 0, implicit $mode, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY10]], 0, [[COPY8]], %36, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY11]], 0, [[COPY9]], %36, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_CNDMASK_B32_e64_1]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.6 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.5 (%ir-block.14): + ; GFX90A_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY %44.sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY %44.sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]] + ; GFX90A_ITERATIVE-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.6.Flow: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.5(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[REG_SEQUENCE2]], %bb.4, [[DEF]], %bb.1 + ; GFX90A_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.5 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.7.ComputeLoop: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.8(0x04000000), %bb.7(0x7c000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI %19, %bb.7, [[S_MOV_B]], %bb.2 + ; GFX90A_ITERATIVE-NEXT: [[PHI3:%[0-9]+]]:vreg_64_align2 = PHI %18, %bb.7, [[DEF]], %bb.2 + ; GFX90A_ITERATIVE-NEXT: [[PHI4:%[0-9]+]]:vreg_64_align2 = PHI %24, %bb.7, [[COPY4]], %bb.2 + ; GFX90A_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_FFBL_B32_e64_:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY14]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_FFBL_B32_e64_1:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY15]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 32, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_FFBL_B32_e64_1]], [[V_MOV_B32_e32_1]], 0, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_MIN_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[V_FFBL_B32_e64_]], [[V_ADD_U32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY2]], [[V_READFIRSTLANE_B32_4]] + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY3]], [[V_READFIRSTLANE_B32_5]] + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: $m0 = COPY [[V_READFIRSTLANE_B32_7]] + ; GFX90A_ITERATIVE-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_6]], $m0, [[COPY18]] + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY17]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: $m0 = COPY [[V_READFIRSTLANE_B32_9]] + ; GFX90A_ITERATIVE-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_8]], $m0, [[COPY19]] + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_WRITELANE_B32_]], %subreg.sub0, [[V_WRITELANE_B32_1]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI2]], 0, [[COPY20]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1 + ; GFX90A_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]] + ; GFX90A_ITERATIVE-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY21]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY22]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_NOT_B32_e32_1:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY23]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY24]], [[V_NOT_B32_e32_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY25]], [[V_NOT_B32_e32_1]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; GFX90A_ITERATIVE-NEXT: [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]] + ; GFX90A_ITERATIVE-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE5]], [[COPY26]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: $vcc = COPY [[V_CMP_NE_U64_e64_]] + ; GFX90A_ITERATIVE-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit $vcc + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.8 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.8.ComputeEnd: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI5:%[0-9]+]]:vreg_64_align2 = PHI [[REG_SEQUENCE4]], %bb.7 + ; GFX90A_ITERATIVE-NEXT: [[PHI6:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_1]], %bb.7 + ; GFX90A_ITERATIVE-NEXT: [[COPY27:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY28:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY29:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY29]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY30:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE6]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_ITERATIVE-NEXT: [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[COPY27]] + ; GFX90A_ITERATIVE-NEXT: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY31]], [[COPY32]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[COPY30]] + ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY33]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY34]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.3 + ; + ; GFX90A_DPP-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw + ; GFX90A_DPP: bb.1 (%ir-block.0): + ; GFX90A_DPP-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GFX90A_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_DPP-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_DPP-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX90A_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.2 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.2 (%ir-block.5): + ; GFX90A_DPP-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX90A_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX90A_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX90A_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0 + ; GFX90A_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX90A_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY9]], [[COPY10]], implicit $exec + ; GFX90A_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX90A_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY11]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_DPP-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX90A_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[REG_SEQUENCE1]], [[COPY12]], implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY13]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, [[V_MOV_B]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY14]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY15]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY16]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY17]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY18:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY18]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY19:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY19]], [[V_ADD_F64_e64_5]], 312, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX90A_DPP-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0 + ; GFX90A_DPP-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1 + ; GFX90A_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY20]], [[S_MOV_B32_2]] + ; GFX90A_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY21]], [[S_MOV_B32_2]] + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[COPY22:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX90A_DPP-NEXT: [[STRICT_WWM:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[COPY22]], implicit $exec + ; GFX90A_DPP-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY23]], implicit $exec + ; GFX90A_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.3 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.3 (%ir-block.32): + ; GFX90A_DPP-NEXT: successors: %bb.5(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_DPP-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_DPP-NEXT: S_BRANCH %bb.5 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.4.Flow: + ; GFX90A_DPP-NEXT: successors: %bb.6(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %45, %bb.5, [[DEF]], %bb.1 + ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.6 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.5 (%ir-block.35): + ; GFX90A_DPP-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.3, [[DEF]], %bb.2 + ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX90A_DPP-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY24]], implicit $exec + ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY25]], implicit $exec + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[STRICT_WWM1:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[V_MOV_B6]], implicit $exec + ; GFX90A_DPP-NEXT: [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE4]] + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_6:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY26]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A_DPP-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A_DPP-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_6]].sub0 + ; GFX90A_DPP-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_6]].sub1 + ; GFX90A_DPP-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY29]], 0, [[COPY27]], [[V_CMP_EQ_U32_e64_]], implicit $exec + ; GFX90A_DPP-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY30]], 0, [[COPY28]], [[V_CMP_EQ_U32_e64_]], implicit $exec + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_CNDMASK_B32_e64_1]], %subreg.sub1 + ; GFX90A_DPP-NEXT: S_BRANCH %bb.4 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.6 (%ir-block.41): + ; GFX90A_DPP-NEXT: [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GFX90A_DPP-NEXT: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY31]], implicit $exec + ; GFX90A_DPP-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] + ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY32]], implicit $exec + ; GFX90A_DPP-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]] + ; GFX90A_DPP-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; + ; GFX940_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw + ; GFX940_ITERATIVE: bb.1 (%ir-block.0): + ; GFX940_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.6(0x40000000) + ; GFX940_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX940_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX940_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.2 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.2 (%ir-block.5): + ; GFX940_ITERATIVE-NEXT: successors: %bb.7(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.7 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.3 (%ir-block.7): + ; GFX940_ITERATIVE-NEXT: successors: %bb.4(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], %27, [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.4 (%ir-block.9): + ; GFX940_ITERATIVE-NEXT: successors: %bb.6(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.3, [[DEF]], %bb.8 + ; GFX940_ITERATIVE-NEXT: SI_END_CF %37, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY7]], 0, %26, 0, 0, implicit $mode, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX940_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX940_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY10]], 0, [[COPY8]], %35, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY11]], 0, [[COPY9]], %35, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_CNDMASK_B32_e64_1]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.6 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.5 (%ir-block.14): + ; GFX940_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY %43.sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY %43.sub1 + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX940_ITERATIVE-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GFX940_ITERATIVE-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]] + ; GFX940_ITERATIVE-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.6.Flow: + ; GFX940_ITERATIVE-NEXT: successors: %bb.5(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[REG_SEQUENCE2]], %bb.4, [[DEF]], %bb.1 + ; GFX940_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.5 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.7.ComputeLoop: + ; GFX940_ITERATIVE-NEXT: successors: %bb.8(0x04000000), %bb.7(0x7c000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI %18, %bb.7, [[S_MOV_B]], %bb.2 + ; GFX940_ITERATIVE-NEXT: [[PHI3:%[0-9]+]]:vreg_64_align2 = PHI %17, %bb.7, [[DEF]], %bb.2 + ; GFX940_ITERATIVE-NEXT: [[PHI4:%[0-9]+]]:vreg_64_align2 = PHI %23, %bb.7, [[COPY4]], %bb.2 + ; GFX940_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_FFBL_B32_e64_:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY14]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_FFBL_B32_e64_1:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY15]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 32, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_FFBL_B32_e64_1]], [[V_MOV_B32_e32_1]], 0, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_MIN_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[V_FFBL_B32_e64_]], [[V_ADD_U32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY2]], [[V_READFIRSTLANE_B32_4]] + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY3]], [[V_READFIRSTLANE_B32_5]] + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: $m0 = COPY [[V_READFIRSTLANE_B32_7]] + ; GFX940_ITERATIVE-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_6]], $m0, [[COPY18]] + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY17]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: $m0 = COPY [[V_READFIRSTLANE_B32_9]] + ; GFX940_ITERATIVE-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_8]], $m0, [[COPY19]] + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_WRITELANE_B32_]], %subreg.sub0, [[V_WRITELANE_B32_1]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI2]], 0, [[COPY20]], 0, 0, implicit $mode, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1 + ; GFX940_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]] + ; GFX940_ITERATIVE-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY21]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY22]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_NOT_B32_e32_1:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY23]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY24]], [[V_NOT_B32_e32_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY25]], [[V_NOT_B32_e32_1]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; GFX940_ITERATIVE-NEXT: [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]] + ; GFX940_ITERATIVE-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE5]], [[COPY26]], implicit $exec + ; GFX940_ITERATIVE-NEXT: $vcc = COPY [[V_CMP_NE_U64_e64_]] + ; GFX940_ITERATIVE-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit $vcc + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.8 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.8.ComputeEnd: + ; GFX940_ITERATIVE-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI5:%[0-9]+]]:vreg_64_align2 = PHI [[REG_SEQUENCE4]], %bb.7 + ; GFX940_ITERATIVE-NEXT: [[PHI6:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_1]], %bb.7 + ; GFX940_ITERATIVE-NEXT: [[COPY27:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY28:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY29:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY29]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY30:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE6]].sub0 + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_ITERATIVE-NEXT: [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[COPY27]] + ; GFX940_ITERATIVE-NEXT: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY31]], [[COPY32]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[COPY30]] + ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY33]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY34]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.3 + ; + ; GFX940_DPP-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw + ; GFX940_DPP: bb.1 (%ir-block.0): + ; GFX940_DPP-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GFX940_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX940_DPP-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX940_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940_DPP-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX940_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.2 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.2 (%ir-block.5): + ; GFX940_DPP-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX940_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX940_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX940_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX940_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0 + ; GFX940_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX940_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY9]], [[COPY10]], implicit $exec + ; GFX940_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX940_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY11]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940_DPP-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX940_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[REG_SEQUENCE1]], [[COPY12]], implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY13]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, [[V_MOV_B]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY14]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY15]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY16]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY17]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY18:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY18]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY19:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY19]], [[V_ADD_F64_e64_5]], 312, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX940_DPP-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0 + ; GFX940_DPP-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1 + ; GFX940_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY20]], [[S_MOV_B32_2]] + ; GFX940_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY21]], [[S_MOV_B32_2]] + ; GFX940_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[COPY22:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX940_DPP-NEXT: [[STRICT_WWM:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[COPY22]], implicit $exec + ; GFX940_DPP-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY23]], implicit $exec + ; GFX940_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.3 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.3 (%ir-block.32): + ; GFX940_DPP-NEXT: successors: %bb.5(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940_DPP-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940_DPP-NEXT: S_BRANCH %bb.5 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.4.Flow: + ; GFX940_DPP-NEXT: successors: %bb.6(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %44, %bb.5, [[DEF]], %bb.1 + ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.6 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.5 (%ir-block.35): + ; GFX940_DPP-NEXT: successors: %bb.4(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.3, [[DEF]], %bb.2 + ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_DPP-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX940_DPP-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY24]], implicit $exec + ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY25]], implicit $exec + ; GFX940_DPP-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[STRICT_WWM1:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[V_MOV_B6]], implicit $exec + ; GFX940_DPP-NEXT: [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE4]] + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_6:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY26]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX940_DPP-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX940_DPP-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_6]].sub0 + ; GFX940_DPP-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_6]].sub1 + ; GFX940_DPP-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY29]], 0, [[COPY27]], [[V_CMP_EQ_U32_e64_]], implicit $exec + ; GFX940_DPP-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY30]], 0, [[COPY28]], [[V_CMP_EQ_U32_e64_]], implicit $exec + ; GFX940_DPP-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_CNDMASK_B32_e64_1]], %subreg.sub1 + ; GFX940_DPP-NEXT: S_BRANCH %bb.4 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.6 (%ir-block.41): + ; GFX940_DPP-NEXT: [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GFX940_DPP-NEXT: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY31]], implicit $exec + ; GFX940_DPP-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] + ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY32]], implicit $exec + ; GFX940_DPP-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]] + ; GFX940_DPP-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret double %ret } diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index fb764560154d5..b1134ae78cb97 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -574,13 +574,44 @@ entry: define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: .LBB2_1: ; %ComputeLoop +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_mov_b32 m0, s5 +; GFX6-NEXT: v_readlane_b32 s8, v0, s5 +; GFX6-NEXT: v_writelane_b32 v1, s4, m0 +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX6-NEXT: s_add_i32 s4, s4, s8 +; GFX6-NEXT: s_cbranch_vccnz .LBB2_1 +; GFX6-NEXT: ; %bb.2: ; %ComputeEnd +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: ; implicit-def: $vgpr0 +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX6-NEXT: s_cbranch_execz .LBB2_4 +; GFX6-NEXT: ; %bb.3: +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX6-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX6-NEXT: .LBB2_4: +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -937,15 +968,46 @@ entry: define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %vindex) { ; GFX6-LABEL: struct_add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[2:3], 0x11 -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: .LBB3_1: ; %ComputeLoop +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_mov_b32 m0, s5 +; GFX6-NEXT: v_readlane_b32 s8, v0, s5 +; GFX6-NEXT: v_writelane_b32 v1, s4, m0 +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX6-NEXT: s_add_i32 s4, s4, s8 +; GFX6-NEXT: s_cbranch_vccnz .LBB3_1 +; GFX6-NEXT: ; %bb.2: ; %ComputeEnd +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: ; implicit-def: $vgpr0 +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX6-NEXT: s_cbranch_execz .LBB3_4 +; GFX6-NEXT: ; %bb.3: +; GFX6-NEXT: s_load_dword s5, s[2:3], 0x11 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 idxen glc +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX6-NEXT: .LBB3_4: +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -2011,13 +2073,44 @@ entry: define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: .LBB7_1: ; %ComputeLoop +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_mov_b32 m0, s5 +; GFX6-NEXT: v_readlane_b32 s8, v0, s5 +; GFX6-NEXT: v_writelane_b32 v1, s4, m0 +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX6-NEXT: s_add_i32 s4, s4, s8 +; GFX6-NEXT: s_cbranch_vccnz .LBB7_1 +; GFX6-NEXT: ; %bb.2: ; %ComputeEnd +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: ; implicit-def: $vgpr0 +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX6-NEXT: s_cbranch_execz .LBB7_4 +; GFX6-NEXT: ; %bb.3: +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX6-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX6-NEXT: .LBB7_4: +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index ca4812f345958..de7fc76b95cf8 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -1,13 +1,22 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX1264 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX1232 %s +; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_DPP %s declare i32 @llvm.amdgcn.workitem.id.x() @@ -46,37 +55,69 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; -; GFX89-LABEL: add_i32_constant: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b64 s[6:7], exec -; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 -; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX89-NEXT: ; implicit-def: $vgpr1 -; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX89-NEXT: s_cbranch_execz .LBB0_2 -; GFX89-NEXT: ; %bb.1: -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7] -; GFX89-NEXT: s_mul_i32 s2, s2, 5 -; GFX89-NEXT: s_mov_b32 s11, 0xf000 -; GFX89-NEXT: s_mov_b32 s10, -1 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: v_mov_b32_e32 v1, s2 -; GFX89-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: buffer_wbinvl1_vol -; GFX89-NEXT: .LBB0_2: -; GFX89-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX89-NEXT: v_readfirstlane_b32 s4, v1 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s3, 0xf000 -; GFX89-NEXT: s_mov_b32 s2, -1 -; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s4 -; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX89-NEXT: s_endpgm +; GFX8-LABEL: add_i32_constant: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_cbranch_execz .LBB0_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s8, s2 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX8-NEXT: s_mul_i32 s2, s2, 5 +; GFX8-NEXT: s_mov_b32 s11, 0xf000 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s9, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: .LBB0_2: +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_readfirstlane_b32 s4, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: add_i32_constant: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB0_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: .LBB0_2: +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i32_constant: ; GFX1064: ; %bb.0: ; %entry @@ -626,642 +667,1226 @@ entry: } define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) { -; GFX7LESS-LABEL: add_i32_varying: +; GFX7LESS_ITERATIVE-LABEL: add_i32_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s4 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB2_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_wbinvl1 +; GFX7LESS_ITERATIVE-NEXT: .LBB2_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt expcnt(0) +; GFX7LESS_ITERATIVE-NEXT: v_add_i32_e32 v0, vcc, s4, v1 +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm +; +; GFX8_ITERATIVE-LABEL: add_i32_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s4 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 +; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 +; GFX8_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_wbinvl1_vol +; GFX8_ITERATIVE-NEXT: .LBB2_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_add_u32_e32 v0, vcc, s4, v1 +; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm +; +; GFX9_ITERATIVE-LABEL: add_i32_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s4 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 +; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 +; GFX9_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol +; GFX9_ITERATIVE-NEXT: .LBB2_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_add_u32_e32 v0, s4, v1 +; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm +; +; GFX1064_ITERATIVE-LABEL: add_i32_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1064_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB2_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm +; +; GFX1032_ITERATIVE-LABEL: add_i32_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s1 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s1 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s6 +; GFX1032_ITERATIVE-NEXT: s_add_i32 s4, s4, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1032_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB2_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: add_i32_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1164_ITERATIVE-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB2_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: add_i32_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s1 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s6 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: s_add_i32 s4, s4, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v1, s4 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1132_ITERATIVE-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB2_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX1264_ITERATIVE-LABEL: add_i32_varying: +; GFX1264_ITERATIVE: ; %bb.0: ; %entry +; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1264_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 +; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8 +; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1264_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1264_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1264_ITERATIVE-NEXT: ; %bb.3: +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1264_ITERATIVE-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0 +; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV +; GFX1264_ITERATIVE-NEXT: .LBB2_4: +; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX1264_ITERATIVE-NEXT: s_nop 0 +; GFX1264_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1264_ITERATIVE-NEXT: s_endpgm +; +; GFX1232_ITERATIVE-LABEL: add_i32_varying: +; GFX1232_ITERATIVE: ; %bb.0: ; %entry +; GFX1232_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1232_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s1 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 +; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s6 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s4, s4, s5 +; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1232_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1232_ITERATIVE-NEXT: ; %bb.3: +; GFX1232_ITERATIVE-NEXT: v_mov_b32_e32 v1, s4 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1232_ITERATIVE-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 +; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV +; GFX1232_ITERATIVE-NEXT: .LBB2_4: +; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1232_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX1232_ITERATIVE-NEXT: s_nop 0 +; GFX1232_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1232_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: add_i32_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_mov_b32 s7, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s6, -1 +; GFX7LESS_DPP-NEXT: s_mov_b32 s10, s6 +; GFX7LESS_DPP-NEXT: s_mov_b32 s11, s7 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s8, s2 +; GFX7LESS_DPP-NEXT: s_mov_b32 s9, s3 +; GFX7LESS_DPP-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX7LESS_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS_DPP-NEXT: buffer_wbinvl1 +; GFX7LESS_DPP-NEXT: s_mov_b32 s4, s0 +; GFX7LESS_DPP-NEXT: s_mov_b32 s5, s1 +; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: add_i32_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s6, v2, 63 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: s_mov_b32 s11, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s10, -1 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: s_mov_b32 s8, s2 +; GFX8_DPP-NEXT: s_mov_b32 s9, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX8_DPP-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX8_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX8_DPP-NEXT: buffer_wbinvl1_vol +; GFX8_DPP-NEXT: .LBB2_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: add_i32_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s6, v2, 63 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s10, -1 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: s_mov_b32 s8, s2 +; GFX9_DPP-NEXT: s_mov_b32 s9, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX9_DPP-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX9_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9_DPP-NEXT: buffer_wbinvl1_vol +; GFX9_DPP-NEXT: .LBB2_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: add_i32_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1064_DPP-NEXT: s_mov_b32 s4, s9 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s8, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1064_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1064_DPP-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl1_inv +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB2_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: add_i32_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s5, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s4, s6 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1032_DPP-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl1_inv +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB2_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: add_i32_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1164_DPP-NEXT: s_mov_b32 s4, s9 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s8, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1164_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1164_DPP-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc +; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl1_inv +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB2_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: add_i32_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s4, s6 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1132_DPP-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc +; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl1_inv +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB2_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm +; +; GFX1264_DPP-LABEL: add_i32_varying: +; GFX1264_DPP: ; %bb.0: ; %entry +; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1264_DPP-NEXT: s_not_b64 exec, exec +; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1264_DPP-NEXT: s_not_b64 exec, exec +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1264_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1264_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1264_DPP-NEXT: v_readlane_b32 s9, v1, 63 +; GFX1264_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1264_DPP-NEXT: s_mov_b32 s4, s9 +; GFX1264_DPP-NEXT: v_writelane_b32 v3, s8, 48 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1264_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1264_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1264_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1264_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX1264_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1264_DPP-NEXT: ; %bb.1: +; GFX1264_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1264_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1264_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1264_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1264_DPP-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 +; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV +; GFX1264_DPP-NEXT: .LBB2_2: +; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1264_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX1264_DPP-NEXT: s_nop 0 +; GFX1264_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1264_DPP-NEXT: s_endpgm +; +; GFX1232_DPP-LABEL: add_i32_varying: +; GFX1232_DPP: ; %bb.0: ; %entry +; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1232_DPP-NEXT: s_mov_b32 s4, s6 +; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1232_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo +; GFX1232_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1232_DPP-NEXT: ; %bb.1: +; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1232_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1232_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1232_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1232_DPP-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0 +; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV +; GFX1232_DPP-NEXT: .LBB2_2: +; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1232_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1232_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1232_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX1232_DPP-NEXT: s_nop 0 +; GFX1232_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1232_DPP-NEXT: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = atomicrmw add ptr addrspace(1) %inout, i32 %lane syncscope("agent") acq_rel + store i32 %old, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace(1) %inout) { +; GFX7LESS-LABEL: add_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: s_mov_b32 s10, s6 -; GFX7LESS-NEXT: s_mov_b32 s11, s7 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB3_2 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s8, s2 ; GFX7LESS-NEXT: s_mov_b32 s9, s3 -; GFX7LESS-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: s_mov_b32 s4, s0 -; GFX7LESS-NEXT: s_mov_b32 s5, s1 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7LESS-NEXT: .LBB3_2: +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s5, v0 +; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; -; GFX8-LABEL: add_i32_varying: +; GFX8-LABEL: add_i64_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s6, 0 -; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: .LBB2_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s4 -; GFX8-NEXT: v_readlane_b32 s7, v0, s4 -; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX8-NEXT: v_writelane_b32 v1, s6, m0 -; GFX8-NEXT: s_add_i32 s6, s6, s7 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX8-NEXT: s_cbranch_execz .LBB2_4 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_cbranch_execz .LBB3_2 +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s8, s2 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX8-NEXT: s_mul_i32 s2, s2, 5 +; GFX8-NEXT: s_mov_b32 s11, 0xf000 +; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_mov_b32 s9, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: .LBB2_4: +; GFX8-NEXT: .LBB3_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s2, v1 +; GFX8-NEXT: v_readfirstlane_b32 s3, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1 -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_nop 2 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; -; GFX9-LABEL: add_i32_varying: +; GFX9-LABEL: add_i64_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s6, 0 -; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: .LBB2_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s4 -; GFX9-NEXT: v_readlane_b32 s7, v0, s4 -; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX9-NEXT: v_writelane_b32 v1, s6, m0 -; GFX9-NEXT: s_add_i32 s6, s6, s7 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s9, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: .LBB2_4: +; GFX9-NEXT: .LBB3_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_add_u32_e32 v0, s4, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_nop 2 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: add_i32_varying: +; GFX1064-LABEL: add_i64_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s6, 0 -; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s7, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s8, v0, s7 -; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX1064-NEXT: v_writelane_b32 v1, s6, s7 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX1064-NEXT: s_add_i32 s6, s6, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 +; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1064-NEXT: s_cbranch_execz .LBB2_4 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064-NEXT: s_cbranch_execz .LBB3_2 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_mul_i32 s6, s6, 5 ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mov_b32 s8, s2 ; GFX1064-NEXT: s_mov_b32 s9, s3 -; GFX1064-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB2_4: +; GFX1064-NEXT: .LBB3_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v1 ; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: add_i32_varying: +; GFX1032-LABEL: add_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: .LBB2_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_readlane_b32 s5, v0, s1 -; GFX1032-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1032-NEXT: v_writelane_b32 v1, s4, s1 -; GFX1032-NEXT: s_andn2_b32 s0, s0, s6 -; GFX1032-NEXT: s_add_i32 s4, s4, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1032-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1032-NEXT: s_xor_b32 s5, exec_lo, s5 -; GFX1032-NEXT: s_cbranch_execz .LBB2_4 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB3_2 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_mul_i32 s5, s5, 5 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: v_mov_b32_e32 v0, s5 ; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mov_b32 s8, s2 ; GFX1032-NEXT: s_mov_b32 s9, s3 -; GFX1032-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB2_4: +; GFX1032-NEXT: .LBB3_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v1 ; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: add_i32_varying: +; GFX1164-LABEL: add_i64_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b32 s6, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s7, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s7 -; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX1164-NEXT: v_writelane_b32 v0, s6, s7 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-NEXT: s_add_i32 s6, s6, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[6:7], exec +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1164-NEXT: s_cbranch_execz .LBB2_4 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_cbranch_execz .LBB3_2 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: s_mul_i32 s6, s6, 5 ; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164-NEXT: s_mov_b32 s10, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_mov_b32 s8, s2 ; GFX1164-NEXT: s_mov_b32 s9, s3 -; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX1164-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB2_4: +; GFX1164-NEXT: .LBB3_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: add_i32_varying: +; GFX1132-LABEL: add_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: .LBB2_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s1 -; GFX1132-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1132-NEXT: v_writelane_b32 v0, s4, s1 -; GFX1132-NEXT: s_and_not1_b32 s0, s0, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_add_i32 s4, s4, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5 -; GFX1132-NEXT: s_cbranch_execz .LBB2_4 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_mov_b32_e32 v1, s4 +; GFX1132-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_cbranch_execz .LBB3_2 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132-NEXT: s_mul_i32 s5, s5, 5 ; GFX1132-NEXT: s_mov_b32 s10, -1 +; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mov_b32 s8, s2 ; GFX1132-NEXT: s_mov_b32 s9, s3 -; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX1132-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB2_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1132-NEXT: .LBB3_2: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; -; GFX1264-LABEL: add_i32_varying: -; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1264-NEXT: s_mov_b64 s[0:1], exec -; GFX1264-NEXT: s_mov_b32 s6, 0 -; GFX1264-NEXT: ; implicit-def: $vgpr0 -; GFX1264-NEXT: .LBB2_1: ; %ComputeLoop -; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-NEXT: s_ctz_i32_b64 s7, s[0:1] -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1264-NEXT: v_readlane_b32 s8, v1, s7 -; GFX1264-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX1264-NEXT: v_writelane_b32 v0, s6, s7 -; GFX1264-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1264-NEXT: s_add_co_i32 s6, s6, s8 -; GFX1264-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1264-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX1264-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1264-NEXT: ; implicit-def: $vgpr1 -; GFX1264-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1264-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1264-NEXT: s_cbranch_execz .LBB2_4 -; GFX1264-NEXT: ; %bb.3: -; GFX1264-NEXT: v_mov_b32_e32 v1, s6 -; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1264-NEXT: s_mov_b32 s10, -1 -; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mov_b32 s8, s2 -; GFX1264-NEXT: s_mov_b32 s9, s3 -; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1264-NEXT: s_wait_loadcnt 0x0 -; GFX1264-NEXT: global_inv scope:SCOPE_DEV -; GFX1264-NEXT: .LBB2_4: -; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GFX1264-NEXT: s_mov_b32 s2, -1 -; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null -; GFX1264-NEXT: s_nop 0 -; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1264-NEXT: s_endpgm -; -; GFX1232-LABEL: add_i32_varying: -; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1232-NEXT: s_mov_b32 s0, exec_lo -; GFX1232-NEXT: s_mov_b32 s4, 0 -; GFX1232-NEXT: ; implicit-def: $vgpr0 -; GFX1232-NEXT: .LBB2_1: ; %ComputeLoop -; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1232-NEXT: v_readlane_b32 s5, v1, s1 -; GFX1232-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1232-NEXT: v_writelane_b32 v0, s4, s1 -; GFX1232-NEXT: s_and_not1_b32 s0, s0, s6 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1232-NEXT: s_add_co_i32 s4, s4, s5 -; GFX1232-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1232-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX1232-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1232-NEXT: ; implicit-def: $vgpr1 -; GFX1232-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1232-NEXT: s_xor_b32 s5, exec_lo, s5 -; GFX1232-NEXT: s_cbranch_execz .LBB2_4 -; GFX1232-NEXT: ; %bb.3: -; GFX1232-NEXT: v_mov_b32_e32 v1, s4 -; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1232-NEXT: s_mov_b32 s10, -1 -; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: s_mov_b32 s8, s2 -; GFX1232-NEXT: s_mov_b32 s9, s3 -; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1232-NEXT: s_wait_loadcnt 0x0 -; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: .LBB2_4: -; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GFX1232-NEXT: s_mov_b32 s2, -1 -; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null -; GFX1232-NEXT: s_nop 0 -; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1232-NEXT: s_endpgm -entry: - %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = atomicrmw add ptr addrspace(1) %inout, i32 %lane syncscope("agent") acq_rel - store i32 %old, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace(1) %inout) { -; GFX7LESS-LABEL: add_i64_constant: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB3_2 -; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 -; GFX7LESS-NEXT: s_mov_b32 s10, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s8, s2 -; GFX7LESS-NEXT: s_mov_b32 s9, s3 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: .LBB3_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 -; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s5, v0 -; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc -; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm -; -; GFX89-LABEL: add_i64_constant: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b64 s[6:7], exec -; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX89-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 -; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX89-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX89-NEXT: s_cbranch_execz .LBB3_2 -; GFX89-NEXT: ; %bb.1: -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7] -; GFX89-NEXT: s_mul_i32 s2, s2, 5 -; GFX89-NEXT: s_mov_b32 s11, 0xf000 -; GFX89-NEXT: s_mov_b32 s10, -1 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: v_mov_b32_e32 v0, s2 -; GFX89-NEXT: v_mov_b32_e32 v1, 0 -; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: buffer_wbinvl1_vol -; GFX89-NEXT: .LBB3_2: -; GFX89-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_readfirstlane_b32 s2, v1 -; GFX89-NEXT: v_readfirstlane_b32 s3, v0 -; GFX89-NEXT: v_mov_b32_e32 v0, s3 -; GFX89-NEXT: v_mov_b32_e32 v1, s2 -; GFX89-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] -; GFX89-NEXT: s_mov_b32 s3, 0xf000 -; GFX89-NEXT: s_mov_b32 s2, -1 -; GFX89-NEXT: s_nop 2 -; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX89-NEXT: s_endpgm -; -; GFX1064-LABEL: add_i64_constant: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b64 s[6:7], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB3_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mul_i32 s6, s6, 5 -; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1064-NEXT: v_mov_b32_e32 v0, s6 -; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s8, s2 -; GFX1064-NEXT: s_mov_b32 s9, s3 -; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: buffer_gl1_inv -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB3_2: -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: add_i64_constant: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB3_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s5, s5, 5 -; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1032-NEXT: v_mov_b32_e32 v0, s5 -; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s8, s2 -; GFX1032-NEXT: s_mov_b32 s9, s3 -; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: buffer_gl1_inv -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB3_2: -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: add_i64_constant: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b64 s[6:7], exec -; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB3_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_mul_i32 s6, s6, 5 -; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1164-NEXT: v_mov_b32_e32 v0, s6 -; GFX1164-NEXT: s_mov_b32 s10, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mov_b32 s8, s2 -; GFX1164-NEXT: s_mov_b32 s9, s3 -; GFX1164-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: buffer_gl1_inv -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB3_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: add_i64_constant: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b32 s5, exec_lo -; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB3_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 -; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1132-NEXT: s_mul_i32 s5, s5, 5 -; GFX1132-NEXT: s_mov_b32 s10, -1 -; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mov_b32 s8, s2 -; GFX1132-NEXT: s_mov_b32 s9, s3 -; GFX1132-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: buffer_gl1_inv -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB3_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm -; -; GFX1264-LABEL: add_i64_constant: +; GFX1264-LABEL: add_i64_constant: ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec @@ -1733,151 +2358,1363 @@ entry: } define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) { -; GFX7LESS-LABEL: add_i64_varying: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: s_mov_b32 s10, s6 -; GFX7LESS-NEXT: s_mov_b32 s11, s7 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s8, s2 -; GFX7LESS-NEXT: s_mov_b32 s9, s3 -; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: s_mov_b32 s4, s0 -; GFX7LESS-NEXT: s_mov_b32 s5, s1 -; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX7LESS-NEXT: s_endpgm +; GFX7LESS_ITERATIVE-LABEL: add_i64_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s6 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB5_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX7LESS_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_wbinvl1 +; GFX7LESS_ITERATIVE-NEXT: .LBB5_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt expcnt(0) +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX7LESS_ITERATIVE-NEXT: v_add_i32_e32 v0, vcc, s5, v1 +; GFX7LESS_ITERATIVE-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm ; -; GFX89-LABEL: add_i64_varying: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_mov_b32 s10, s6 -; GFX89-NEXT: s_mov_b32 s11, s7 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: v_mov_b32_e32 v1, 0 -; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: buffer_wbinvl1_vol -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX89-NEXT: s_endpgm -; -; GFX10-LABEL: add_i64_varying: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: s_mov_b32 s11, s7 -; GFX10-NEXT: s_mov_b32 s10, s6 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s2 -; GFX10-NEXT: s_mov_b32 s9, s3 -; GFX10-NEXT: s_mov_b32 s4, s0 -; GFX10-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_mov_b32 s5, s1 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX10-NEXT: s_endpgm -; -; GFX1164-LABEL: add_i64_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s6, -1 -; GFX1164-NEXT: s_mov_b32 s11, s7 -; GFX1164-NEXT: s_mov_b32 s10, s6 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mov_b32 s8, s2 -; GFX1164-NEXT: s_mov_b32 s9, s3 -; GFX1164-NEXT: s_mov_b32 s4, s0 -; GFX1164-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: buffer_gl1_inv -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: s_mov_b32 s5, s1 -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm +; GFX8_ITERATIVE-LABEL: add_i64_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8_ITERATIVE-NEXT: s_add_u32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0 +; GFX8_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX8_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc +; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_wbinvl1_vol +; GFX8_ITERATIVE-NEXT: .LBB5_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_ITERATIVE-NEXT: v_add_u32_e32 v0, vcc, s5, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc +; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm ; -; GFX1132-LABEL: add_i64_varying: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s6, -1 -; GFX1132-NEXT: s_mov_b32 s11, s7 -; GFX1132-NEXT: s_mov_b32 s10, s6 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mov_b32 s8, s2 -; GFX1132-NEXT: s_mov_b32 s9, s3 -; GFX1132-NEXT: s_mov_b32 s4, s0 -; GFX1132-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: buffer_gl1_inv -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: s_mov_b32 s5, s1 -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm +; GFX9_ITERATIVE-LABEL: add_i64_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9_ITERATIVE-NEXT: s_add_u32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0 +; GFX9_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX9_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc +; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol +; GFX9_ITERATIVE-NEXT: .LBB5_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_ITERATIVE-NEXT: v_add_co_u32_e32 v0, vcc, s5, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v2, vcc +; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm ; -; GFX1264-LABEL: add_i64_varying: -; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1264-NEXT: v_mov_b32_e32 v1, 0 -; GFX1264-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1264-NEXT: s_mov_b32 s6, -1 -; GFX1264-NEXT: s_mov_b32 s11, s7 -; GFX1264-NEXT: s_mov_b32 s10, s6 -; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mov_b32 s8, s2 -; GFX1264-NEXT: s_mov_b32 s9, s3 -; GFX1264-NEXT: s_mov_b32 s4, s0 -; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1264-NEXT: s_wait_loadcnt 0x0 -; GFX1264-NEXT: global_inv scope:SCOPE_DEV -; GFX1264-NEXT: s_mov_b32 s5, s1 -; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null -; GFX1264-NEXT: s_nop 0 -; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1264-NEXT: s_endpgm +; GFX1064_ITERATIVE-LABEL: add_i64_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s6 +; GFX1064_ITERATIVE-NEXT: s_add_u32 s4, s4, s7 +; GFX1064_ITERATIVE-NEXT: s_addc_u32 s5, s5, s8 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1064_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc +; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB5_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1064_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v1 +; GFX1064_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm ; -; GFX1232-LABEL: add_i64_varying: -; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1232-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1232-NEXT: s_mov_b32 s6, -1 -; GFX1232-NEXT: s_mov_b32 s11, s7 -; GFX1232-NEXT: s_mov_b32 s10, s6 -; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: s_mov_b32 s8, s2 -; GFX1232-NEXT: s_mov_b32 s9, s3 -; GFX1232-NEXT: s_mov_b32 s4, s0 -; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1232-NEXT: s_wait_loadcnt 0x0 -; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: s_mov_b32 s5, s1 -; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null -; GFX1232-NEXT: s_nop 0 -; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1232-NEXT: s_endpgm +; GFX1032_ITERATIVE-LABEL: add_i64_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s1 +; GFX1032_ITERATIVE-NEXT: s_add_u32 s4, s4, s6 +; GFX1032_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1032_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc +; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB5_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1032_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1 +; GFX1032_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: add_i64_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s6 +; GFX1164_ITERATIVE-NEXT: s_add_u32 s4, s4, s7 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_addc_u32 s5, s5, s8 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1164_ITERATIVE-NEXT: buffer_atomic_add_u64 v[2:3], off, s[8:11], 0 glc +; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB5_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v0 +; GFX1164_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: add_i64_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s1 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_ITERATIVE-NEXT: s_add_u32 s4, s4, s6 +; GFX1132_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1132_ITERATIVE-NEXT: buffer_atomic_add_u64 v[2:3], off, s[8:11], 0 glc +; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB5_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0 +; GFX1132_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX1264_ITERATIVE-LABEL: add_i64_varying: +; GFX1264_ITERATIVE: ; %bb.0: ; %entry +; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1264_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s10 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s10 +; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9] +; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] +; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1264_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1264_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX1264_ITERATIVE-NEXT: ; %bb.3: +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1264_ITERATIVE-NEXT: buffer_atomic_add_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0 +; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV +; GFX1264_ITERATIVE-NEXT: .LBB5_4: +; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v0 +; GFX1264_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1264_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX1264_ITERATIVE-NEXT: s_nop 0 +; GFX1264_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1264_ITERATIVE-NEXT: s_endpgm +; +; GFX1232_ITERATIVE-LABEL: add_i64_varying: +; GFX1232_ITERATIVE: ; %bb.0: ; %entry +; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s1 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1 +; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s1 +; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8 +; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] +; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo +; GFX1232_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6 +; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX1232_ITERATIVE-NEXT: ; %bb.3: +; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1232_ITERATIVE-NEXT: buffer_atomic_add_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 +; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV +; GFX1232_ITERATIVE-NEXT: .LBB5_4: +; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0 +; GFX1232_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1232_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX1232_ITERATIVE-NEXT: s_nop 0 +; GFX1232_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1232_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: add_i64_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_mov_b32 s7, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s6, -1 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 s10, s6 +; GFX7LESS_DPP-NEXT: s_mov_b32 s11, s7 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s8, s2 +; GFX7LESS_DPP-NEXT: s_mov_b32 s9, s3 +; GFX7LESS_DPP-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc +; GFX7LESS_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS_DPP-NEXT: buffer_wbinvl1 +; GFX7LESS_DPP-NEXT: s_mov_b32 s4, s0 +; GFX7LESS_DPP-NEXT: s_mov_b32 s5, s1 +; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: add_i64_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s7 +; GFX8_DPP-NEXT: s_mov_b32 s11, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s10, -1 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: s_mov_b32 s8, s2 +; GFX8_DPP-NEXT: s_mov_b32 s9, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s6 +; GFX8_DPP-NEXT: buffer_atomic_add_x2 v[7:8], off, s[8:11], 0 glc +; GFX8_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX8_DPP-NEXT: buffer_wbinvl1_vol +; GFX8_DPP-NEXT: .LBB5_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: v_add_u32_e32 v7, vcc, s5, v7 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v8, vcc, v0, v8, vcc +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: add_i64_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s7 +; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s10, -1 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: s_mov_b32 s8, s2 +; GFX9_DPP-NEXT: s_mov_b32 s9, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s6 +; GFX9_DPP-NEXT: buffer_atomic_add_x2 v[7:8], off, s[8:11], 0 glc +; GFX9_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9_DPP-NEXT: buffer_wbinvl1_vol +; GFX9_DPP-NEXT: .LBB5_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v7, vcc, s5, v7 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v8, vcc, v0, v8, vcc +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: add_i64_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s10, v3, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s11, v4, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX1064_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s11, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s10, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[8:9] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1064_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1064_DPP-NEXT: buffer_atomic_add_x2 v[9:10], off, s[4:7], 0 glc +; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl1_inv +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB5_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v10 +; GFX1064_DPP-NEXT: v_add_co_u32 v9, vcc, s2, v11 +; GFX1064_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc, s3, v12, vcc +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: add_i64_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s8, v4, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s8, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1032_DPP-NEXT: buffer_atomic_add_x2 v[9:10], off, s[4:7], 0 glc +; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl1_inv +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB5_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v10 +; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s2, v11 +; GFX1032_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s3, v12, vcc_lo +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: add_i64_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s10, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s11, v3, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s10, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s11, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[8:9] +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[8:9], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s4 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1164_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1164_DPP-NEXT: buffer_atomic_add_u64 v[8:9], off, s[4:7], 0 glc +; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl1_inv +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB5_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, v2 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v9 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32 v8, vcc, s2, v10 +; GFX1164_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s3, v11, vcc +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: add_i64_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s7, v4, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s8, v3, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s8, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s8, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1132_DPP-NEXT: buffer_atomic_add_u64 v[8:9], off, s[4:7], 0 glc +; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl1_inv +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB5_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v2 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v9 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s2, v10 +; GFX1132_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm +; +; GFX1264_DPP-LABEL: add_i64_varying: +; GFX1264_DPP: ; %bb.0: ; %entry +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX1264_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1264_DPP-NEXT: s_not_b64 exec, exec +; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1264_DPP-NEXT: s_not_b64 exec, exec +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc +; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_readlane_b32 s6, v4, 15 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1264_DPP-NEXT: v_readlane_b32 s8, v4, 31 +; GFX1264_DPP-NEXT: v_readlane_b32 s9, v3, 31 +; GFX1264_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1264_DPP-NEXT: v_readlane_b32 s6, v4, 63 +; GFX1264_DPP-NEXT: v_writelane_b32 v2, s7, 16 +; GFX1264_DPP-NEXT: v_readlane_b32 s10, v4, 47 +; GFX1264_DPP-NEXT: v_readlane_b32 s11, v3, 47 +; GFX1264_DPP-NEXT: v_readlane_b32 s7, v3, 63 +; GFX1264_DPP-NEXT: v_writelane_b32 v1, s8, 32 +; GFX1264_DPP-NEXT: v_writelane_b32 v2, s9, 32 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX1264_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX1264_DPP-NEXT: v_writelane_b32 v1, s10, 48 +; GFX1264_DPP-NEXT: v_writelane_b32 v2, s11, 48 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[8:9] +; GFX1264_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1264_DPP-NEXT: s_mov_b64 s[8:9], exec +; GFX1264_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1264_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1264_DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1264_DPP-NEXT: ; %bb.1: +; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, s5 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v8, s4 +; GFX1264_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1264_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1264_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1264_DPP-NEXT: buffer_atomic_add_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 +; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV +; GFX1264_DPP-NEXT: .LBB5_2: +; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v11, v2 +; GFX1264_DPP-NEXT: v_readfirstlane_b32 s3, v9 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_add_co_u32 v8, vcc, s2, v10 +; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s3, v11, vcc +; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], null +; GFX1264_DPP-NEXT: s_nop 0 +; GFX1264_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1264_DPP-NEXT: s_endpgm +; +; GFX1232_DPP-LABEL: add_i64_varying: +; GFX1232_DPP: ; %bb.0: ; %entry +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v2 +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX1232_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_readlane_b32 s7, v4, 15 +; GFX1232_DPP-NEXT: v_readlane_b32 s8, v3, 15 +; GFX1232_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1232_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1232_DPP-NEXT: v_writelane_b32 v2, s8, 16 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1232_DPP-NEXT: s_mov_b32 s8, exec_lo +; GFX1232_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1232_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1232_DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1232_DPP-NEXT: ; %bb.1: +; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4 +; GFX1232_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1232_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1232_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1232_DPP-NEXT: buffer_atomic_add_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0 +; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV +; GFX1232_DPP-NEXT: .LBB5_2: +; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v11, v2 +; GFX1232_DPP-NEXT: v_readfirstlane_b32 s3, v9 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s2, v10 +; GFX1232_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo +; GFX1232_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], null +; GFX1232_DPP-NEXT: s_nop 0 +; GFX1232_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1232_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 @@ -2377,579 +4214,1126 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 -; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: sub_i32_uniform: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1164-NEXT: s_load_b32 s2, s[2:3], 0x34 -; GFX1164-NEXT: s_mov_b64 s[8:9], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB7_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[8:9] -; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mul_i32 s3, s2, s3 -; GFX1164-NEXT: s_mov_b32 s10, -1 -; GFX1164-NEXT: v_mov_b32_e32 v1, s3 -; GFX1164-NEXT: s_mov_b32 s8, s6 -; GFX1164-NEXT: s_mov_b32 s9, s7 -; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: buffer_gl1_inv -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB7_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s6, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: sub_i32_uniform: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX1132-NEXT: s_mov_b32 s8, exec_lo -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB7_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s8 -; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mul_i32 s2, s0, s2 -; GFX1132-NEXT: s_mov_b32 s10, -1 -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-NEXT: s_mov_b32 s8, s6 -; GFX1132-NEXT: s_mov_b32 s9, s7 -; GFX1132-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: buffer_gl1_inv -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB7_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0 -; GFX1132-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s6, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm -; -; GFX1264-LABEL: sub_i32_uniform: -; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: s_clause 0x1 -; GFX1264-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1264-NEXT: s_load_b32 s2, s[2:3], 0x34 -; GFX1264-NEXT: s_mov_b64 s[8:9], exec -; GFX1264-NEXT: s_mov_b64 s[0:1], exec -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1264-NEXT: ; implicit-def: $vgpr1 -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 -; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1264-NEXT: s_cbranch_execz .LBB7_2 -; GFX1264-NEXT: ; %bb.1: -; GFX1264-NEXT: s_bcnt1_i32_b64 s3, s[8:9] -; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mul_i32 s3, s2, s3 -; GFX1264-NEXT: s_mov_b32 s10, -1 -; GFX1264-NEXT: v_mov_b32_e32 v1, s3 -; GFX1264-NEXT: s_mov_b32 s8, s6 -; GFX1264-NEXT: s_mov_b32 s9, s7 -; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1264-NEXT: s_wait_loadcnt 0x0 -; GFX1264-NEXT: global_inv scope:SCOPE_DEV -; GFX1264-NEXT: .LBB7_2: -; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX1264-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1264-NEXT: s_mov_b32 s6, -1 -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null -; GFX1264-NEXT: s_nop 0 -; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1264-NEXT: s_endpgm -; -; GFX1232-LABEL: sub_i32_uniform: -; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: s_clause 0x1 -; GFX1232-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1232-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX1232-NEXT: s_mov_b32 s8, exec_lo -; GFX1232-NEXT: s_mov_b32 s1, exec_lo -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1232-NEXT: ; implicit-def: $vgpr1 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1232-NEXT: s_cbranch_execz .LBB7_2 -; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s8 -; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: s_mul_i32 s2, s0, s2 -; GFX1232-NEXT: s_mov_b32 s10, -1 -; GFX1232-NEXT: v_mov_b32_e32 v1, s2 -; GFX1232-NEXT: s_mov_b32 s8, s6 -; GFX1232-NEXT: s_mov_b32 s9, s7 -; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1232-NEXT: s_wait_loadcnt 0x0 -; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: .LBB7_2: -; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: v_mul_lo_u32 v0, s0, v0 -; GFX1232-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1232-NEXT: s_mov_b32 s6, -1 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1232-NEXT: buffer_store_b32 v0, off, s[4:7], null -; GFX1232-NEXT: s_nop 0 -; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1232-NEXT: s_endpgm -entry: - %old = atomicrmw sub ptr addrspace(1) %inout, i32 %subitive syncscope("agent") acq_rel - store i32 %old, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) { -; GFX7LESS-LABEL: sub_i32_varying: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: s_mov_b32 s10, s6 -; GFX7LESS-NEXT: s_mov_b32 s11, s7 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s8, s2 -; GFX7LESS-NEXT: s_mov_b32 s9, s3 -; GFX7LESS-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: s_mov_b32 s4, s0 -; GFX7LESS-NEXT: s_mov_b32 s5, s1 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: sub_i32_varying: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s6, 0 -; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: .LBB8_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s4 -; GFX8-NEXT: v_readlane_b32 s7, v0, s4 -; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX8-NEXT: v_writelane_b32 v1, s6, m0 -; GFX8-NEXT: s_add_i32 s6, s6, s7 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX8-NEXT: s_cbranch_execz .LBB8_4 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s8, s2 -; GFX8-NEXT: s_mov_b32 s9, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: .LBB8_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1 -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: sub_i32_varying: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s6, 0 -; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: .LBB8_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s4 -; GFX9-NEXT: v_readlane_b32 s7, v0, s4 -; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX9-NEXT: v_writelane_b32 v1, s6, m0 -; GFX9-NEXT: s_add_i32 s6, s6, s7 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB8_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: .LBB8_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: sub_i32_varying: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s6, 0 -; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s7, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s8, v0, s7 -; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX1064-NEXT: v_writelane_b32 v1, s6, s7 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX1064-NEXT: s_add_i32 s6, s6, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1064-NEXT: s_cbranch_execz .LBB8_4 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: v_mov_b32_e32 v0, s6 -; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s8, s2 -; GFX1064-NEXT: s_mov_b32 s9, s3 -; GFX1064-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: buffer_gl1_inv -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB8_4: -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v1 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: sub_i32_varying: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: .LBB8_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_readlane_b32 s5, v0, s1 -; GFX1032-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1032-NEXT: v_writelane_b32 v1, s4, s1 -; GFX1032-NEXT: s_andn2_b32 s0, s0, s6 -; GFX1032-NEXT: s_add_i32 s4, s4, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1032-NEXT: s_xor_b32 s5, exec_lo, s5 -; GFX1032-NEXT: s_cbranch_execz .LBB8_4 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 -; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s8, s2 -; GFX1032-NEXT: s_mov_b32 s9, s3 -; GFX1032-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: buffer_gl1_inv -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB8_4: -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v1 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: sub_i32_varying: +; GFX1164-LABEL: sub_i32_uniform: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b32 s2, s[2:3], 0x34 +; GFX1164-NEXT: s_mov_b64 s[8:9], exec ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b32 s6, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: .LBB8_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s7, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s7 -; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX1164-NEXT: v_writelane_b32 v0, s6, s7 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-NEXT: s_add_i32 s6, s6, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1164-NEXT: s_cbranch_execz .LBB8_4 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB7_2 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[8:9] ; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s10, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mov_b32 s8, s2 -; GFX1164-NEXT: s_mov_b32 s9, s3 +; GFX1164-NEXT: s_mul_i32 s3, s2, s3 +; GFX1164-NEXT: s_mov_b32 s10, -1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s3 +; GFX1164-NEXT: s_mov_b32 s8, s6 +; GFX1164-NEXT: s_mov_b32 s9, s7 ; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB8_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB7_2: +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s6, -1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: sub_i32_varying: +; GFX1132-LABEL: sub_i32_uniform: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: .LBB8_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s1 -; GFX1132-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1132-NEXT: v_writelane_b32 v0, s4, s1 -; GFX1132-NEXT: s_and_not1_b32 s0, s0, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_add_i32 s4, s4, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX1132-NEXT: s_mov_b32 s8, exec_lo +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5 -; GFX1132-NEXT: s_cbranch_execz .LBB8_4 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_mov_b32_e32 v1, s4 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB7_2 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s8 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s10, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mov_b32 s8, s2 -; GFX1132-NEXT: s_mov_b32 s9, s3 +; GFX1132-NEXT: s_mul_i32 s2, s0, s2 +; GFX1132-NEXT: s_mov_b32 s10, -1 +; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: s_mov_b32 s8, s6 +; GFX1132-NEXT: s_mov_b32 s9, s7 ; GFX1132-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB8_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1132-NEXT: .LBB7_2: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s6, -1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; -; GFX1264-LABEL: sub_i32_varying: +; GFX1264-LABEL: sub_i32_uniform: ; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1264-NEXT: s_clause 0x1 +; GFX1264-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1264-NEXT: s_load_b32 s2, s[2:3], 0x34 +; GFX1264-NEXT: s_mov_b64 s[8:9], exec ; GFX1264-NEXT: s_mov_b64 s[0:1], exec -; GFX1264-NEXT: s_mov_b32 s6, 0 -; GFX1264-NEXT: ; implicit-def: $vgpr0 -; GFX1264-NEXT: .LBB8_1: ; %ComputeLoop -; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-NEXT: s_ctz_i32_b64 s7, s[0:1] -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1264-NEXT: v_readlane_b32 s8, v1, s7 -; GFX1264-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX1264-NEXT: v_writelane_b32 v0, s6, s7 -; GFX1264-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1264-NEXT: s_add_co_i32 s6, s6, s8 -; GFX1264-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1264-NEXT: s_cbranch_scc1 .LBB8_1 -; GFX1264-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 -; GFX1264-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1264-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1264-NEXT: s_cbranch_execz .LBB8_4 -; GFX1264-NEXT: ; %bb.3: -; GFX1264-NEXT: v_mov_b32_e32 v1, s6 +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1264-NEXT: s_cbranch_execz .LBB7_2 +; GFX1264-NEXT: ; %bb.1: +; GFX1264-NEXT: s_bcnt1_i32_b64 s3, s[8:9] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1264-NEXT: s_mov_b32 s10, -1 ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mov_b32 s8, s2 -; GFX1264-NEXT: s_mov_b32 s9, s3 +; GFX1264-NEXT: s_mul_i32 s3, s2, s3 +; GFX1264-NEXT: s_mov_b32 s10, -1 +; GFX1264-NEXT: v_mov_b32_e32 v1, s3 +; GFX1264-NEXT: s_mov_b32 s8, s6 +; GFX1264-NEXT: s_mov_b32 s9, s7 ; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV -; GFX1264-NEXT: .LBB8_4: -; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1264-NEXT: .LBB7_2: +; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX1264-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1264-NEXT: s_mov_b32 s6, -1 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1264-NEXT: s_mov_b32 s2, -1 -; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null ; GFX1264-NEXT: s_nop 0 ; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1264-NEXT: s_endpgm ; -; GFX1232-LABEL: sub_i32_varying: +; GFX1232-LABEL: sub_i32_uniform: ; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1232-NEXT: s_mov_b32 s0, exec_lo -; GFX1232-NEXT: s_mov_b32 s4, 0 -; GFX1232-NEXT: ; implicit-def: $vgpr0 -; GFX1232-NEXT: .LBB8_1: ; %ComputeLoop -; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1232-NEXT: v_readlane_b32 s5, v1, s1 -; GFX1232-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1232-NEXT: v_writelane_b32 v0, s4, s1 -; GFX1232-NEXT: s_and_not1_b32 s0, s0, s6 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1232-NEXT: s_add_co_i32 s4, s4, s5 -; GFX1232-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1232-NEXT: s_cbranch_scc1 .LBB8_1 -; GFX1232-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1232-NEXT: s_clause 0x1 +; GFX1232-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1232-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX1232-NEXT: s_mov_b32 s8, exec_lo +; GFX1232-NEXT: s_mov_b32 s1, exec_lo +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr1 -; GFX1232-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1232-NEXT: s_xor_b32 s5, exec_lo, s5 -; GFX1232-NEXT: s_cbranch_execz .LBB8_4 -; GFX1232-NEXT: ; %bb.3: -; GFX1232-NEXT: v_mov_b32_e32 v1, s4 +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1232-NEXT: s_cbranch_execz .LBB7_2 +; GFX1232-NEXT: ; %bb.1: +; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s8 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1232-NEXT: s_mov_b32 s10, -1 ; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: s_mov_b32 s8, s2 -; GFX1232-NEXT: s_mov_b32 s9, s3 +; GFX1232-NEXT: s_mul_i32 s2, s0, s2 +; GFX1232-NEXT: s_mov_b32 s10, -1 +; GFX1232-NEXT: v_mov_b32_e32 v1, s2 +; GFX1232-NEXT: s_mov_b32 s8, s6 +; GFX1232-NEXT: s_mov_b32 s9, s7 ; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: .LBB8_4: -; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1232-NEXT: .LBB7_2: +; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1232-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1232-NEXT: s_mov_b32 s6, -1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1232-NEXT: s_mov_b32 s2, -1 -; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1232-NEXT: buffer_store_b32 v0, off, s[4:7], null ; GFX1232-NEXT: s_nop 0 ; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1232-NEXT: s_endpgm +entry: + %old = atomicrmw sub ptr addrspace(1) %inout, i32 %subitive syncscope("agent") acq_rel + store i32 %old, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) { +; GFX7LESS_ITERATIVE-LABEL: sub_i32_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s4 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB8_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_wbinvl1 +; GFX7LESS_ITERATIVE-NEXT: .LBB8_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt expcnt(0) +; GFX7LESS_ITERATIVE-NEXT: v_sub_i32_e32 v0, vcc, s4, v1 +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm +; +; GFX8_ITERATIVE-LABEL: sub_i32_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s4 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 +; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 +; GFX8_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_wbinvl1_vol +; GFX8_ITERATIVE-NEXT: .LBB8_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_sub_u32_e32 v0, vcc, s4, v1 +; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm +; +; GFX9_ITERATIVE-LABEL: sub_i32_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s4 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 +; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 +; GFX9_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol +; GFX9_ITERATIVE-NEXT: .LBB8_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_sub_u32_e32 v0, s4, v1 +; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm +; +; GFX1064_ITERATIVE-LABEL: sub_i32_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1064_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB8_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm +; +; GFX1032_ITERATIVE-LABEL: sub_i32_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s1 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s1 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s6 +; GFX1032_ITERATIVE-NEXT: s_add_i32 s4, s4, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1032_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB8_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: sub_i32_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1164_ITERATIVE-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc +; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB8_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: sub_i32_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s1 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s6 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: s_add_i32 s4, s4, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v1, s4 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1132_ITERATIVE-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc +; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB8_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX1264_ITERATIVE-LABEL: sub_i32_varying: +; GFX1264_ITERATIVE: ; %bb.0: ; %entry +; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1264_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 +; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8 +; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1264_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1264_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX1264_ITERATIVE-NEXT: ; %bb.3: +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1264_ITERATIVE-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0 +; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV +; GFX1264_ITERATIVE-NEXT: .LBB8_4: +; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX1264_ITERATIVE-NEXT: s_nop 0 +; GFX1264_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1264_ITERATIVE-NEXT: s_endpgm +; +; GFX1232_ITERATIVE-LABEL: sub_i32_varying: +; GFX1232_ITERATIVE: ; %bb.0: ; %entry +; GFX1232_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1232_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s1 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 +; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s6 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s4, s4, s5 +; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1232_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX1232_ITERATIVE-NEXT: ; %bb.3: +; GFX1232_ITERATIVE-NEXT: v_mov_b32_e32 v1, s4 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1232_ITERATIVE-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 +; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV +; GFX1232_ITERATIVE-NEXT: .LBB8_4: +; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1232_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX1232_ITERATIVE-NEXT: s_nop 0 +; GFX1232_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1232_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: sub_i32_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_mov_b32 s7, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s6, -1 +; GFX7LESS_DPP-NEXT: s_mov_b32 s10, s6 +; GFX7LESS_DPP-NEXT: s_mov_b32 s11, s7 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s8, s2 +; GFX7LESS_DPP-NEXT: s_mov_b32 s9, s3 +; GFX7LESS_DPP-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX7LESS_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS_DPP-NEXT: buffer_wbinvl1 +; GFX7LESS_DPP-NEXT: s_mov_b32 s4, s0 +; GFX7LESS_DPP-NEXT: s_mov_b32 s5, s1 +; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: sub_i32_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s6, v2, 63 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: s_mov_b32 s11, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s10, -1 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: s_mov_b32 s8, s2 +; GFX8_DPP-NEXT: s_mov_b32 s9, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX8_DPP-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX8_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX8_DPP-NEXT: buffer_wbinvl1_vol +; GFX8_DPP-NEXT: .LBB8_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 +; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: sub_i32_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s6, v2, 63 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s10, -1 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: s_mov_b32 s8, s2 +; GFX9_DPP-NEXT: s_mov_b32 s9, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX9_DPP-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX9_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9_DPP-NEXT: buffer_wbinvl1_vol +; GFX9_DPP-NEXT: .LBB8_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: sub_i32_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1064_DPP-NEXT: s_mov_b32 s4, s9 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s8, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1064_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1064_DPP-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl1_inv +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB8_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: sub_i32_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s5, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s4, s6 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1032_DPP-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl1_inv +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB8_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: sub_i32_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1164_DPP-NEXT: s_mov_b32 s4, s9 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s8, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1164_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1164_DPP-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc +; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl1_inv +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB8_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: sub_i32_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s4, s6 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1132_DPP-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc +; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl1_inv +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB8_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm +; +; GFX1264_DPP-LABEL: sub_i32_varying: +; GFX1264_DPP: ; %bb.0: ; %entry +; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1264_DPP-NEXT: s_not_b64 exec, exec +; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1264_DPP-NEXT: s_not_b64 exec, exec +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1264_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1264_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1264_DPP-NEXT: v_readlane_b32 s9, v1, 63 +; GFX1264_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1264_DPP-NEXT: s_mov_b32 s4, s9 +; GFX1264_DPP-NEXT: v_writelane_b32 v3, s8, 48 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1264_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1264_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1264_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1264_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX1264_DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX1264_DPP-NEXT: ; %bb.1: +; GFX1264_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1264_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1264_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1264_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1264_DPP-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 +; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV +; GFX1264_DPP-NEXT: .LBB8_2: +; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1264_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX1264_DPP-NEXT: s_nop 0 +; GFX1264_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1264_DPP-NEXT: s_endpgm +; +; GFX1232_DPP-LABEL: sub_i32_varying: +; GFX1232_DPP: ; %bb.0: ; %entry +; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1232_DPP-NEXT: s_mov_b32 s4, s6 +; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1232_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo +; GFX1232_DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX1232_DPP-NEXT: ; %bb.1: +; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1232_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1232_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1232_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1232_DPP-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0 +; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV +; GFX1232_DPP-NEXT: .LBB8_2: +; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1232_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1232_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1232_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX1232_DPP-NEXT: s_nop 0 +; GFX1232_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1232_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = atomicrmw sub ptr addrspace(1) %inout, i32 %lane syncscope("agent") acq_rel @@ -3731,151 +6115,1363 @@ entry: } define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) { -; GFX7LESS-LABEL: sub_i64_varying: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: s_mov_b32 s10, s6 -; GFX7LESS-NEXT: s_mov_b32 s11, s7 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s8, s2 -; GFX7LESS-NEXT: s_mov_b32 s9, s3 -; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: s_mov_b32 s4, s0 -; GFX7LESS-NEXT: s_mov_b32 s5, s1 -; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX7LESS-NEXT: s_endpgm +; GFX7LESS_ITERATIVE-LABEL: sub_i64_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s6 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB11_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX7LESS_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_wbinvl1 +; GFX7LESS_ITERATIVE-NEXT: .LBB11_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt expcnt(0) +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX7LESS_ITERATIVE-NEXT: v_sub_i32_e32 v0, vcc, s5, v1 +; GFX7LESS_ITERATIVE-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm ; -; GFX89-LABEL: sub_i64_varying: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_mov_b32 s10, s6 -; GFX89-NEXT: s_mov_b32 s11, s7 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: v_mov_b32_e32 v1, 0 -; GFX89-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: buffer_wbinvl1_vol -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX89-NEXT: s_endpgm -; -; GFX10-LABEL: sub_i64_varying: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: s_mov_b32 s11, s7 -; GFX10-NEXT: s_mov_b32 s10, s6 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s2 -; GFX10-NEXT: s_mov_b32 s9, s3 -; GFX10-NEXT: s_mov_b32 s4, s0 -; GFX10-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_mov_b32 s5, s1 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX10-NEXT: s_endpgm -; -; GFX1164-LABEL: sub_i64_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s6, -1 -; GFX1164-NEXT: s_mov_b32 s11, s7 -; GFX1164-NEXT: s_mov_b32 s10, s6 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mov_b32 s8, s2 -; GFX1164-NEXT: s_mov_b32 s9, s3 -; GFX1164-NEXT: s_mov_b32 s4, s0 -; GFX1164-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: buffer_gl1_inv -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: s_mov_b32 s5, s1 -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm +; GFX8_ITERATIVE-LABEL: sub_i64_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8_ITERATIVE-NEXT: s_add_u32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0 +; GFX8_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX8_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc +; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_wbinvl1_vol +; GFX8_ITERATIVE-NEXT: .LBB11_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_ITERATIVE-NEXT: v_sub_u32_e32 v0, vcc, s5, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc +; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm ; -; GFX1132-LABEL: sub_i64_varying: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s6, -1 -; GFX1132-NEXT: s_mov_b32 s11, s7 -; GFX1132-NEXT: s_mov_b32 s10, s6 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mov_b32 s8, s2 -; GFX1132-NEXT: s_mov_b32 s9, s3 -; GFX1132-NEXT: s_mov_b32 s4, s0 -; GFX1132-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: buffer_gl1_inv -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: s_mov_b32 s5, s1 -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm +; GFX9_ITERATIVE-LABEL: sub_i64_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9_ITERATIVE-NEXT: s_add_u32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0 +; GFX9_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX9_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc +; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol +; GFX9_ITERATIVE-NEXT: .LBB11_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_ITERATIVE-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v2, vcc +; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm ; -; GFX1264-LABEL: sub_i64_varying: -; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1264-NEXT: v_mov_b32_e32 v1, 0 -; GFX1264-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1264-NEXT: s_mov_b32 s6, -1 -; GFX1264-NEXT: s_mov_b32 s11, s7 -; GFX1264-NEXT: s_mov_b32 s10, s6 -; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mov_b32 s8, s2 -; GFX1264-NEXT: s_mov_b32 s9, s3 -; GFX1264-NEXT: s_mov_b32 s4, s0 -; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1264-NEXT: s_wait_loadcnt 0x0 -; GFX1264-NEXT: global_inv scope:SCOPE_DEV -; GFX1264-NEXT: s_mov_b32 s5, s1 -; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null -; GFX1264-NEXT: s_nop 0 -; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1264-NEXT: s_endpgm +; GFX1064_ITERATIVE-LABEL: sub_i64_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s6 +; GFX1064_ITERATIVE-NEXT: s_add_u32 s4, s4, s7 +; GFX1064_ITERATIVE-NEXT: s_addc_u32 s5, s5, s8 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1064_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc +; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB11_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1064_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v1 +; GFX1064_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm ; -; GFX1232-LABEL: sub_i64_varying: -; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1232-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1232-NEXT: s_mov_b32 s6, -1 -; GFX1232-NEXT: s_mov_b32 s11, s7 -; GFX1232-NEXT: s_mov_b32 s10, s6 -; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: s_mov_b32 s8, s2 -; GFX1232-NEXT: s_mov_b32 s9, s3 -; GFX1232-NEXT: s_mov_b32 s4, s0 -; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1232-NEXT: s_wait_loadcnt 0x0 -; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: s_mov_b32 s5, s1 -; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null -; GFX1232-NEXT: s_nop 0 -; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1232-NEXT: s_endpgm +; GFX1032_ITERATIVE-LABEL: sub_i64_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s1 +; GFX1032_ITERATIVE-NEXT: s_add_u32 s4, s4, s6 +; GFX1032_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1032_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc +; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB11_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1032_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1 +; GFX1032_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: sub_i64_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s6 +; GFX1164_ITERATIVE-NEXT: s_add_u32 s4, s4, s7 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_addc_u32 s5, s5, s8 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1164_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[2:3], off, s[8:11], 0 glc +; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB11_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v0 +; GFX1164_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: sub_i64_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s1 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_ITERATIVE-NEXT: s_add_u32 s4, s4, s6 +; GFX1132_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1132_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[2:3], off, s[8:11], 0 glc +; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB11_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 +; GFX1132_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX1264_ITERATIVE-LABEL: sub_i64_varying: +; GFX1264_ITERATIVE: ; %bb.0: ; %entry +; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1264_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s10 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s10 +; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9] +; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] +; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1264_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1264_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1264_ITERATIVE-NEXT: ; %bb.3: +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1264_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0 +; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV +; GFX1264_ITERATIVE-NEXT: .LBB11_4: +; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v0 +; GFX1264_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1264_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX1264_ITERATIVE-NEXT: s_nop 0 +; GFX1264_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1264_ITERATIVE-NEXT: s_endpgm +; +; GFX1232_ITERATIVE-LABEL: sub_i64_varying: +; GFX1232_ITERATIVE: ; %bb.0: ; %entry +; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1232_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s1 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1 +; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s1 +; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8 +; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] +; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo +; GFX1232_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6 +; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1232_ITERATIVE-NEXT: ; %bb.3: +; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1232_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 +; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV +; GFX1232_ITERATIVE-NEXT: .LBB11_4: +; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 +; GFX1232_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1232_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX1232_ITERATIVE-NEXT: s_nop 0 +; GFX1232_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1232_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: sub_i64_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_mov_b32 s7, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s6, -1 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 s10, s6 +; GFX7LESS_DPP-NEXT: s_mov_b32 s11, s7 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s8, s2 +; GFX7LESS_DPP-NEXT: s_mov_b32 s9, s3 +; GFX7LESS_DPP-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc +; GFX7LESS_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS_DPP-NEXT: buffer_wbinvl1 +; GFX7LESS_DPP-NEXT: s_mov_b32 s4, s0 +; GFX7LESS_DPP-NEXT: s_mov_b32 s5, s1 +; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: sub_i64_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s7 +; GFX8_DPP-NEXT: s_mov_b32 s11, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s10, -1 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: s_mov_b32 s8, s2 +; GFX8_DPP-NEXT: s_mov_b32 s9, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s6 +; GFX8_DPP-NEXT: buffer_atomic_sub_x2 v[7:8], off, s[8:11], 0 glc +; GFX8_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX8_DPP-NEXT: buffer_wbinvl1_vol +; GFX8_DPP-NEXT: .LBB11_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: v_sub_u32_e32 v7, vcc, s5, v7 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_subb_u32_e32 v8, vcc, v0, v8, vcc +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: sub_i64_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s7 +; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s10, -1 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: s_mov_b32 s8, s2 +; GFX9_DPP-NEXT: s_mov_b32 s9, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s6 +; GFX9_DPP-NEXT: buffer_atomic_sub_x2 v[7:8], off, s[8:11], 0 glc +; GFX9_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9_DPP-NEXT: buffer_wbinvl1_vol +; GFX9_DPP-NEXT: .LBB11_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: v_sub_co_u32_e32 v7, vcc, s5, v7 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_subb_co_u32_e32 v8, vcc, v0, v8, vcc +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: sub_i64_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s10, v3, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s11, v4, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX1064_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s11, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s10, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[8:9] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1064_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1064_DPP-NEXT: buffer_atomic_sub_x2 v[9:10], off, s[4:7], 0 glc +; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl1_inv +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB11_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v10 +; GFX1064_DPP-NEXT: v_sub_co_u32 v9, vcc, s2, v11 +; GFX1064_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1064_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc, s3, v12, vcc +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: sub_i64_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s8, v4, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s8, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1032_DPP-NEXT: buffer_atomic_sub_x2 v[9:10], off, s[4:7], 0 glc +; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl1_inv +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB11_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v10 +; GFX1032_DPP-NEXT: v_sub_co_u32 v9, vcc_lo, s2, v11 +; GFX1032_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, s3, v12, vcc_lo +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: sub_i64_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s10, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s11, v3, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s10, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s11, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[8:9] +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[8:9], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s4 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1164_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1164_DPP-NEXT: buffer_atomic_sub_u64 v[8:9], off, s[4:7], 0 glc +; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl1_inv +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB11_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, v2 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v9 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_sub_co_u32 v8, vcc, s2, v10 +; GFX1164_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc, s3, v11, vcc +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: sub_i64_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s7, v4, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s8, v3, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s8, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s8, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1132_DPP-NEXT: buffer_atomic_sub_u64 v[8:9], off, s[4:7], 0 glc +; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl1_inv +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB11_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v2 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v9 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s2, v10 +; GFX1132_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm +; +; GFX1264_DPP-LABEL: sub_i64_varying: +; GFX1264_DPP: ; %bb.0: ; %entry +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX1264_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1264_DPP-NEXT: s_not_b64 exec, exec +; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1264_DPP-NEXT: s_not_b64 exec, exec +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc +; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_readlane_b32 s6, v4, 15 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1264_DPP-NEXT: v_readlane_b32 s8, v4, 31 +; GFX1264_DPP-NEXT: v_readlane_b32 s9, v3, 31 +; GFX1264_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1264_DPP-NEXT: v_readlane_b32 s6, v4, 63 +; GFX1264_DPP-NEXT: v_writelane_b32 v2, s7, 16 +; GFX1264_DPP-NEXT: v_readlane_b32 s10, v4, 47 +; GFX1264_DPP-NEXT: v_readlane_b32 s11, v3, 47 +; GFX1264_DPP-NEXT: v_readlane_b32 s7, v3, 63 +; GFX1264_DPP-NEXT: v_writelane_b32 v1, s8, 32 +; GFX1264_DPP-NEXT: v_writelane_b32 v2, s9, 32 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX1264_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX1264_DPP-NEXT: v_writelane_b32 v1, s10, 48 +; GFX1264_DPP-NEXT: v_writelane_b32 v2, s11, 48 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[8:9] +; GFX1264_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1264_DPP-NEXT: s_mov_b64 s[8:9], exec +; GFX1264_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1264_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1264_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1264_DPP-NEXT: ; %bb.1: +; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, s5 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v8, s4 +; GFX1264_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1264_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1264_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1264_DPP-NEXT: buffer_atomic_sub_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 +; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV +; GFX1264_DPP-NEXT: .LBB11_2: +; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v11, v2 +; GFX1264_DPP-NEXT: v_readfirstlane_b32 s3, v9 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_sub_co_u32 v8, vcc, s2, v10 +; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1264_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc, s3, v11, vcc +; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], null +; GFX1264_DPP-NEXT: s_nop 0 +; GFX1264_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1264_DPP-NEXT: s_endpgm +; +; GFX1232_DPP-LABEL: sub_i64_varying: +; GFX1232_DPP: ; %bb.0: ; %entry +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v2 +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX1232_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_readlane_b32 s7, v4, 15 +; GFX1232_DPP-NEXT: v_readlane_b32 s8, v3, 15 +; GFX1232_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1232_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1232_DPP-NEXT: v_writelane_b32 v2, s8, 16 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1232_DPP-NEXT: s_mov_b32 s8, exec_lo +; GFX1232_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1232_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1232_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1232_DPP-NEXT: ; %bb.1: +; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4 +; GFX1232_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1232_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1232_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1232_DPP-NEXT: buffer_atomic_sub_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0 +; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV +; GFX1232_DPP-NEXT: .LBB11_2: +; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v11, v2 +; GFX1232_DPP-NEXT: v_readfirstlane_b32 s3, v9 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s2, v10 +; GFX1232_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1232_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo +; GFX1232_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], null +; GFX1232_DPP-NEXT: s_nop 0 +; GFX1232_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1232_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 @@ -3883,6 +7479,3 @@ entry: store i64 %old, ptr addrspace(1) %out ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX11: {{.*}} -; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 3784af443c7f1..1439d4b40c951 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -1,11 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s +; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s +; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s declare i32 @llvm.amdgcn.workitem.id.x() @@ -15,8 +22,6 @@ declare i32 @llvm.amdgcn.workitem.id.x() ; Show what the atomic optimization pass will do for local pointers. define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { -; -; ; GFX7LESS-LABEL: add_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec @@ -229,8 +234,6 @@ entry: } define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) { -; -; ; GFX7LESS-LABEL: add_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec @@ -460,277 +463,633 @@ entry: } define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: add_i32_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB2_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB2_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS_ITERATIVE-NEXT: v_add_i32_e32 v0, vcc, s4, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm ; +; GFX8_ITERATIVE-LABEL: add_i32_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB2_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_add_u32_e32 v0, vcc, s4, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm ; -; GFX7LESS-LABEL: add_i32_varying: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm +; GFX9_ITERATIVE-LABEL: add_i32_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB2_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_add_u32_e32 v0, s4, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm ; -; GFX8-LABEL: add_i32_varying: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s4, 0 -; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: .LBB2_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB2_4 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v2 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB2_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX8-NEXT: s_endpgm +; GFX1064_ITERATIVE-LABEL: add_i32_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB2_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm ; -; GFX9-LABEL: add_i32_varying: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: .LBB2_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s5 -; GFX9-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB2_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB2_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_add_u32_e32 v0, s4, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX1032_ITERATIVE-LABEL: add_i32_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB2_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm ; -; GFX1064-LABEL: add_i32_varying: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s4, 0 -; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064-NEXT: s_add_i32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB2_4 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v2 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB2_4: -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v1 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm +; GFX1164_ITERATIVE-LABEL: add_i32_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: ds_add_rtn_u32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB2_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm ; -; GFX1032-LABEL: add_i32_varying: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: .LBB2_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_add_i32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032-NEXT: s_cbranch_execz .LBB2_4 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 -; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v2 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB2_4: -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v1 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm +; GFX1132_ITERATIVE-LABEL: add_i32_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_add_rtn_u32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB2_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm ; -; GFX1164-LABEL: add_i32_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-NEXT: s_add_i32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB2_4 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_add_rtn_u32 v1, v1, v2 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB2_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm +; GFX7LESS_DPP-LABEL: add_i32_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_add_rtn_u32 v0, v1, v0 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm ; -; GFX1132-LABEL: add_i32_varying: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: .LBB2_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_add_i32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132-NEXT: s_cbranch_execz .LBB2_4 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB2_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm +; GFX8_DPP-LABEL: add_i32_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_add_rtn_u32 v0, v3, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB2_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: add_i32_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: ds_add_rtn_u32 v0, v3, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB2_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: add_i32_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB2_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: add_i32_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB2_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: add_i32_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB2_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: add_i32_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB2_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = atomicrmw add ptr addrspace(3) @local_var32, i32 %lane acq_rel @@ -739,191 +1098,428 @@ entry: } define amdgpu_kernel void @add_i32_varying_nouse() { -; GFX7LESS-LABEL: add_i32_varying_nouse: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: ds_add_u32 v1, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_endpgm +; GFX7LESS_ITERATIVE-LABEL: add_i32_varying_nouse: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX7LESS_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB3_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_add_u32 v0, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB3_4: +; GFX7LESS_ITERATIVE-NEXT: s_endpgm ; -; GFX8-LABEL: add_i32_varying_nouse: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s2, 0 -; GFX8-NEXT: .LBB3_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX8-NEXT: s_add_i32 s2, s2, s6 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB3_4 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_add_u32 v0, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB3_4: -; GFX8-NEXT: s_endpgm +; GFX8_ITERATIVE-LABEL: add_i32_varying_nouse: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX8_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_add_u32 v0, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB3_4: +; GFX8_ITERATIVE-NEXT: s_endpgm ; -; GFX9-LABEL: add_i32_varying_nouse: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s2, 0 -; GFX9-NEXT: .LBB3_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX9-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX9-NEXT: s_add_i32 s2, s2, s6 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB3_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: ds_add_u32 v0, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB3_4: -; GFX9-NEXT: s_endpgm +; GFX9_ITERATIVE-LABEL: add_i32_varying_nouse: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX9_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 +; GFX9_ITERATIVE-NEXT: ds_add_u32 v0, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB3_4: +; GFX9_ITERATIVE-NEXT: s_endpgm ; -; GFX1064-LABEL: add_i32_varying_nouse: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s2, 0 -; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s6, v0, s3 -; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX1064-NEXT: s_add_i32 s2, s2, s6 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB3_4 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: ds_add_u32 v0, v1 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB3_4: -; GFX1064-NEXT: s_endpgm +; GFX1064_ITERATIVE-LABEL: add_i32_varying_nouse: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX1064_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064_ITERATIVE-NEXT: ds_add_u32 v0, v1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB3_4: +; GFX1064_ITERATIVE-NEXT: s_endpgm ; -; GFX1032-LABEL: add_i32_varying_nouse: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s2, s1 -; GFX1032-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s2 -; GFX1032-NEXT: s_add_i32 s0, s0, s3 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032-NEXT: s_cbranch_execz .LBB3_4 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v1, s0 -; GFX1032-NEXT: ds_add_u32 v0, v1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB3_4: -; GFX1032-NEXT: s_endpgm +; GFX1032_ITERATIVE-LABEL: add_i32_varying_nouse: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1032_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032_ITERATIVE-NEXT: ds_add_u32 v0, v1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB3_4: +; GFX1032_ITERATIVE-NEXT: s_endpgm ; -; GFX1164-LABEL: add_i32_varying_nouse: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b32 s2, 0 -; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s6, v0, s3 -; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] -; GFX1164-NEXT: s_add_i32 s2, s2, s6 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB3_4 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: ds_add_u32 v0, v1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB3_4: -; GFX1164-NEXT: s_endpgm +; GFX1164_ITERATIVE-LABEL: add_i32_varying_nouse: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX1164_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164_ITERATIVE-NEXT: ds_add_u32 v0, v1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB3_4: +; GFX1164_ITERATIVE-NEXT: s_endpgm ; -; GFX1132-LABEL: add_i32_varying_nouse: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1132-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s2 -; GFX1132-NEXT: s_add_i32 s0, s0, s3 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132-NEXT: s_cbranch_execz .LBB3_4 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; GFX1132-NEXT: ds_add_u32 v0, v1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB3_4: -; GFX1132-NEXT: s_endpgm +; GFX1132_ITERATIVE-LABEL: add_i32_varying_nouse: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1132_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX1132_ITERATIVE-NEXT: ds_add_u32 v0, v1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB3_4: +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: add_i32_varying_nouse: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: ds_add_u32 v1, v0 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: add_i32_varying_nouse: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b32 s0, s2 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_add_u32 v2, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB3_2: +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: add_i32_varying_nouse: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b32 s0, s2 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX9_DPP-NEXT: ds_add_u32 v2, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB3_2: +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: add_i32_varying_nouse: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 0 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: s_add_i32 s0, s2, s3 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_DPP-NEXT: ds_add_u32 v0, v3 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB3_2: +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: add_i32_varying_nouse: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: ds_add_u32 v0, v3 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB3_2: +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: add_i32_varying_nouse: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: ds_add_u32 v0, v3 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB3_2: +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: add_i32_varying_nouse: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 +; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: ds_add_u32 v0, v3 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB3_2: +; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = atomicrmw add ptr addrspace(3) @local_var32, i32 %lane acq_rel @@ -931,8 +1527,6 @@ entry: } define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { -; -; ; GFX7LESS-LABEL: add_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec @@ -1163,8 +1757,6 @@ entry: } define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) { -; -; ; GFX7LESS-LABEL: add_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec @@ -1441,120 +2033,1653 @@ entry: } define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: add_i64_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s6 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 +; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB6_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB6_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX7LESS_ITERATIVE-NEXT: v_add_i32_e32 v0, vcc, s5, v1 +; GFX7LESS_ITERATIVE-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm +; +; GFX8_ITERATIVE-LABEL: add_i64_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX8_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB6_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_ITERATIVE-NEXT: v_add_u32_e32 v0, vcc, s5, v1 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm +; +; GFX9_ITERATIVE-LABEL: add_i64_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX9_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB6_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_ITERATIVE-NEXT: v_add_co_u32_e32 v0, vcc, s5, v1 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v2, vcc +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm +; +; GFX1064_ITERATIVE-LABEL: add_i64_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 +; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 +; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB6_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1064_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v1 +; GFX1064_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm +; +; GFX1032_ITERATIVE-LABEL: add_i64_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 +; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 +; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB6_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1032_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1 +; GFX1032_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: add_i64_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 +; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1164_ITERATIVE-NEXT: ds_add_rtn_u64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB6_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v0 +; GFX1164_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: add_i64_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 +; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_add_rtn_u64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB6_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0 +; GFX1132_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: add_i64_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: add_i64_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_add_rtn_u64 v[9:10], v8, v[9:10] +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB6_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v10 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v9 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: v_add_u32_e32 v7, vcc, s5, v7 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v8, vcc, v0, v8, vcc +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: add_i64_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX9_DPP-NEXT: ds_add_rtn_u64 v[9:10], v8, v[9:10] +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB6_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v10 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v9 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v7, vcc, s5, v7 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v8, vcc, v0, v8, vcc +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: add_i64_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v3, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v4, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s9, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1064_DPP-NEXT: ds_add_rtn_u64 v[11:12], v10, v[11:12] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB6_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_add_co_u32 v9, vcc, s0, v9 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc, s1, v10, vcc +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: add_i64_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1032_DPP-NEXT: ds_add_rtn_u64 v[11:12], v10, v[11:12] +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB6_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s0, v9 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s1, v10, vcc_lo +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: add_i64_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s4, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s0 +; GFX1164_DPP-NEXT: ds_add_rtn_u64 v[10:11], v9, v[10:11] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB6_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v2 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32 v8, vcc, s0, v8 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s1, v9, vcc +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: add_i64_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v4, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v3, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX1132_DPP-NEXT: ds_add_rtn_u64 v[10:11], v9, v[10:11] +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB6_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v2 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s0, v8 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s1, v9, vcc_lo +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %zext = zext i32 %lane to i64 + %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %zext acq_rel + store i64 %old, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @add_i64_varying_nouse() { +; GFX7LESS_ITERATIVE-LABEL: add_i64_varying_nouse: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3] +; GFX7LESS_ITERATIVE-NEXT: s_nop 0 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s4 +; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 +; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB7_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB7_4: +; GFX7LESS_ITERATIVE-NEXT: s_endpgm +; +; GFX8_ITERATIVE-LABEL: add_i64_varying_nouse: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3] +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s4 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX8_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 +; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB7_4: +; GFX8_ITERATIVE-NEXT: s_endpgm +; +; GFX9_ITERATIVE-LABEL: add_i64_varying_nouse: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3] +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s4 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX9_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 +; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1 +; GFX9_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB7_4: +; GFX9_ITERATIVE-NEXT: s_endpgm +; +; GFX1064_ITERATIVE-LABEL: add_i64_varying_nouse: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s4 +; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s5 +; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1 +; GFX1064_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB7_4: +; GFX1064_ITERATIVE-NEXT: s_endpgm +; +; GFX1032_ITERATIVE-LABEL: add_i64_varying_nouse: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1032_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s3 +; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB7_4: +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: add_i64_varying_nouse: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s4, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s4 +; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s5 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1 +; GFX1164_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB7_4: +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: add_i64_varying_nouse: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1132_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s4 +; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX1132_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB7_4: +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: add_i64_varying_nouse: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: ds_add_u64 v1, v[0:1] +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_endpgm ; +; GFX8_DPP-LABEL: add_i64_varying_nouse: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s3, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB7_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_add_u64 v8, v[9:10] +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB7_2: +; GFX8_DPP-NEXT: s_endpgm ; -; GFX7LESS-LABEL: add_i64_varying: +; GFX9_DPP-LABEL: add_i64_varying_nouse: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v1, vcc, v3, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v2, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s3, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB7_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX9_DPP-NEXT: ds_add_u64 v8, v[9:10] +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB7_2: +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: add_i64_varying_nouse: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v3, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v4, v2, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v4, vcc +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 0 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 0 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 32 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_add_u32 s0, s3, s4 +; GFX1064_DPP-NEXT: s_addc_u32 s1, s2, s5 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB7_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1064_DPP-NEXT: ds_add_u64 v10, v[11:12] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB7_2: +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: add_i64_varying_nouse: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v3, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB7_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: ds_add_u64 v10, v[11:12] +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB7_2: +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: add_i64_varying_nouse: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v6, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v6 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v7 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v4, v1, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v5, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v3, v2 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlane64_b32 v3, v2 +; GFX1164_DPP-NEXT: v_permlane64_b32 v4, v1 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v2, v3 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v1, v4, vcc +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v3 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB7_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: ds_add_u64 v7, v[8:9] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB7_2: +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: add_i64_varying_nouse: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_and_b32 v6, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v6 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v7 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132_DPP-NEXT: v_add_co_u32 v2, vcc_lo, v3, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v4, vcc_lo +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v3 +; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB7_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: ds_add_u64 v7, v[8:9] +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB7_2: +; GFX1132_DPP-NEXT: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %zext = zext i32 %lane to i64 + %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %zext acq_rel + ret void +} + +define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { +; GFX7LESS-LABEL: sub_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB8_2 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 +; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: add_i64_varying: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: add_i64_varying: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: add_i64_varying: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX10-NEXT: s_endpgm -; -; GFX1164-LABEL: add_i64_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: add_i64_varying: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm -entry: - %lane = call i32 @llvm.amdgcn.workitem.id.x() - %zext = zext i32 %lane to i64 - %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %zext acq_rel - store i64 %old, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { -; -; -; GFX7LESS-LABEL: sub_i32_constant: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 -; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB7_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: .LBB8_2: +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -1572,7 +3697,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB7_2 +; GFX8-NEXT: s_cbranch_execz .LBB8_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 @@ -1581,7 +3706,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB7_2: +; GFX8-NEXT: .LBB8_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 @@ -1601,7 +3726,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 @@ -1609,7 +3734,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB7_2: +; GFX9-NEXT: .LBB8_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 @@ -1629,7 +3754,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB7_2 +; GFX1064-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -1638,7 +3763,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB7_2: +; GFX1064-NEXT: .LBB8_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 @@ -1659,7 +3784,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB7_2 +; GFX1032-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -1668,7 +3793,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB7_2: +; GFX1032-NEXT: .LBB8_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 @@ -1691,7 +3816,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB7_2 +; GFX1164-NEXT: s_cbranch_execz .LBB8_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -1701,7 +3826,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB7_2: +; GFX1164-NEXT: .LBB8_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 @@ -1724,7 +3849,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB7_2 +; GFX1132-NEXT: s_cbranch_execz .LBB8_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -1733,7 +3858,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB7_2: +; GFX1132-NEXT: .LBB8_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 @@ -1754,8 +3879,6 @@ entry: } define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) { -; -; ; GFX7LESS-LABEL: sub_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec @@ -1765,7 +3888,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB8_2 +; GFX7LESS-NEXT: s_cbranch_execz .LBB9_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -1775,7 +3898,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB8_2: +; GFX7LESS-NEXT: .LBB9_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -1796,7 +3919,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB8_2 +; GFX8-NEXT: s_cbranch_execz .LBB9_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1806,7 +3929,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB8_2: +; GFX8-NEXT: .LBB9_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1827,7 +3950,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cbranch_execz .LBB9_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1836,7 +3959,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB8_2: +; GFX9-NEXT: .LBB9_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1857,7 +3980,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB8_2 +; GFX1064-NEXT: s_cbranch_execz .LBB9_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -1867,7 +3990,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB8_2: +; GFX1064-NEXT: .LBB9_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 @@ -1888,7 +4011,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB8_2 +; GFX1032-NEXT: s_cbranch_execz .LBB9_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -1898,7 +4021,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB8_2: +; GFX1032-NEXT: .LBB9_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 @@ -1921,7 +4044,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB8_2 +; GFX1164-NEXT: s_cbranch_execz .LBB9_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -1932,7 +4055,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB8_2: +; GFX1164-NEXT: .LBB9_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -1956,7 +4079,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB8_2 +; GFX1132-NEXT: s_cbranch_execz .LBB9_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -1966,7 +4089,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB8_2: +; GFX1132-NEXT: .LBB9_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -1987,1097 +4110,2558 @@ entry: } define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: sub_i32_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB10_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB10_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS_ITERATIVE-NEXT: v_sub_i32_e32 v0, vcc, s4, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm +; +; GFX8_ITERATIVE-LABEL: sub_i32_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB10_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_sub_u32_e32 v0, vcc, s4, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm +; +; GFX9_ITERATIVE-LABEL: sub_i32_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB10_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_sub_u32_e32 v0, s4, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm +; +; GFX1064_ITERATIVE-LABEL: sub_i32_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB10_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm +; +; GFX1032_ITERATIVE-LABEL: sub_i32_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB10_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: sub_i32_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: ds_sub_rtn_u32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB10_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: sub_i32_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_sub_rtn_u32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB10_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: sub_i32_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_sub_rtn_u32 v0, v1, v0 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: sub_i32_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB10_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_sub_rtn_u32 v0, v3, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB10_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: sub_i32_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB10_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: ds_sub_rtn_u32 v0, v3, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB10_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: sub_i32_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB10_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB10_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: sub_i32_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB10_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB10_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: sub_i32_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB10_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB10_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: sub_i32_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB10_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB10_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 %lane acq_rel + store i32 %old, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @sub_i32_varying_nouse() { +; GFX7LESS_ITERATIVE-LABEL: sub_i32_varying_nouse: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX7LESS_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB11_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_sub_u32 v0, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB11_4: +; GFX7LESS_ITERATIVE-NEXT: s_endpgm +; +; GFX8_ITERATIVE-LABEL: sub_i32_varying_nouse: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX8_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_sub_u32 v0, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB11_4: +; GFX8_ITERATIVE-NEXT: s_endpgm +; +; GFX9_ITERATIVE-LABEL: sub_i32_varying_nouse: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 +; GFX9_ITERATIVE-NEXT: ds_sub_u32 v0, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB11_4: +; GFX9_ITERATIVE-NEXT: s_endpgm +; +; GFX1064_ITERATIVE-LABEL: sub_i32_varying_nouse: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX1064_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064_ITERATIVE-NEXT: ds_sub_u32 v0, v1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB11_4: +; GFX1064_ITERATIVE-NEXT: s_endpgm +; +; GFX1032_ITERATIVE-LABEL: sub_i32_varying_nouse: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1032_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032_ITERATIVE-NEXT: ds_sub_u32 v0, v1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB11_4: +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: sub_i32_varying_nouse: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX1164_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164_ITERATIVE-NEXT: ds_sub_u32 v0, v1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB11_4: +; GFX1164_ITERATIVE-NEXT: s_endpgm ; +; GFX1132_ITERATIVE-LABEL: sub_i32_varying_nouse: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1132_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX1132_ITERATIVE-NEXT: ds_sub_u32 v0, v1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB11_4: +; GFX1132_ITERATIVE-NEXT: s_endpgm ; -; GFX7LESS-LABEL: sub_i32_varying: +; GFX7LESS_DPP-LABEL: sub_i32_varying_nouse: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: ds_sub_u32 v1, v0 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: sub_i32_varying_nouse: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b32 s0, s2 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_sub_u32 v2, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB11_2: +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: sub_i32_varying_nouse: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b32 s0, s2 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX9_DPP-NEXT: ds_sub_u32 v2, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB11_2: +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: sub_i32_varying_nouse: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 0 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: s_add_i32 s0, s2, s3 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_DPP-NEXT: ds_sub_u32 v0, v3 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB11_2: +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: sub_i32_varying_nouse: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: ds_sub_u32 v0, v3 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB11_2: +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: sub_i32_varying_nouse: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: ds_sub_u32 v0, v3 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB11_2: +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: sub_i32_varying_nouse: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 +; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: ds_sub_u32 v0, v3 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB11_2: +; GFX1132_DPP-NEXT: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 %lane acq_rel + ret void +} + +define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { +; GFX7LESS-LABEL: sub_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB12_2 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 +; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: .LBB12_2: +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v0 +; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 +; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; -; GFX8-LABEL: sub_i32_varying: +; GFX8-LABEL: sub_i64_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s4, 0 -; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: .LBB9_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB9_4 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_cbranch_execz .LBB12_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8-NEXT: s_mul_i32 s4, s4, 5 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v2 +; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB9_4: +; GFX8-NEXT: .LBB12_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8-NEXT: v_readfirstlane_b32 s4, v1 +; GFX8-NEXT: v_readfirstlane_b32 s5, v0 +; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s5, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; -; GFX9-LABEL: sub_i32_varying: +; GFX9-LABEL: sub_i64_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: .LBB9_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s5 -; GFX9-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB9_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v2 +; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_mul_i32 s4, s4, 5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: .LBB12_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: sub_i32_varying: +; GFX1064-LABEL: sub_i64_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s4, 0 -; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064-NEXT: s_add_i32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 +; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB9_4 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v2 +; GFX1064-NEXT: s_cbranch_execz .LBB12_2 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_mul_i32 s4, s4, 5 +; GFX1064-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB9_4: +; GFX1064-NEXT: .LBB12_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v1 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 +; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: sub_i32_varying: +; GFX1032-LABEL: sub_i64_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: .LBB9_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_add_i32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032-NEXT: s_cbranch_execz .LBB9_4 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 -; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v2 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB12_2 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_mul_i32 s1, s1, 5 +; GFX1032-NEXT: v_mov_b32_e32 v0, s1 +; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB9_4: +; GFX1032-NEXT: .LBB12_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 +; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v1 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: sub_i32_varying: +; GFX1164-LABEL: sub_i64_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: .LBB9_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-NEXT: s_add_i32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB9_4 -; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_cbranch_execz .LBB12_2 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 +; GFX1164-NEXT: s_mul_i32 s4, s4, 5 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB9_4: +; GFX1164-NEXT: .LBB12_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0 +; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: sub_i32_varying: +; GFX1132-LABEL: sub_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: .LBB9_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_add_i32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132-NEXT: s_cbranch_execz .LBB9_4 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_cbranch_execz .LBB12_2 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX1132-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132-NEXT: s_mul_i32 s1, s1, 5 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_mov_b32_e32 v0, s1 +; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB9_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: .LBB12_2: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 +; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: - %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 %lane acq_rel - store i32 %old, ptr addrspace(1) %out + %old = atomicrmw sub ptr addrspace(3) @local_var64, i64 5 acq_rel + store i64 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @sub_i32_varying_nouse() { -; GFX7LESS-LABEL: sub_i32_varying_nouse: +define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) { +; GFX7LESS-LABEL: sub_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB13_2 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 +; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: ds_sub_u32 v1, v0 +; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: .LBB13_2: +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mov_b32 s4, s0 +; GFX7LESS-NEXT: s_mov_b32 s5, s1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v0 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 +; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 +; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 +; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s1, v2 +; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm ; -; GFX8-LABEL: sub_i32_varying_nouse: +; GFX8-LABEL: sub_i64_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s2, 0 -; GFX8-NEXT: .LBB10_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX8-NEXT: s_add_i32 s2, s2, s6 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB10_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB10_4 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_cbranch_execz .LBB13_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 +; GFX8-NEXT: s_mul_i32 s6, s3, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_sub_u32 v0, v1 +; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: .LBB13_2: +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB10_4: +; GFX8-NEXT: s_mov_b32 s4, s0 +; GFX8-NEXT: s_mov_b32 s5, s1 +; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v1 +; GFX8-NEXT: v_readfirstlane_b32 s1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s1, v2 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; -; GFX9-LABEL: sub_i32_varying_nouse: +; GFX9-LABEL: sub_i64_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s2, 0 -; GFX9-NEXT: .LBB10_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX9-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX9-NEXT: s_add_i32 s2, s2, s6 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB10_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: ds_sub_u32 v0, v1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB10_4: +; GFX9-NEXT: s_mul_i32 s7, s3, s6 +; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 +; GFX9-NEXT: s_add_i32 s8, s8, s7 +; GFX9-NEXT: s_mul_i32 s6, s2, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: .LBB13_2: +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s1, v3 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: sub_i32_varying_nouse: +; GFX1064-LABEL: sub_i64_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s2, 0 -; GFX1064-NEXT: .LBB10_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s6, v0, s3 -; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX1064-NEXT: s_add_i32 s2, s2, s6 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB10_4 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: ds_sub_u32 v0, v1 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB13_2 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_mul_i32 s7, s3, s6 +; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 +; GFX1064-NEXT: s_mul_i32 s6, s2, s6 +; GFX1064-NEXT: s_add_i32 s8, s8, s7 +; GFX1064-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064-NEXT: v_mov_b32_e32 v1, s8 +; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB10_4: +; GFX1064-NEXT: .LBB13_2: +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 +; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v3 +; GFX1064-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc +; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: sub_i32_varying_nouse: +; GFX1032-LABEL: sub_i64_uniform: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB10_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s2, s1 -; GFX1032-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s2 -; GFX1032-NEXT: s_add_i32 s0, s0, s3 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032-NEXT: s_cbranch_execz .LBB10_4 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v1, s0 -; GFX1032-NEXT: ds_sub_u32 v0, v1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB10_4: -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: sub_i32_varying_nouse: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b32 s2, 0 -; GFX1164-NEXT: .LBB10_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s6, v0, s3 -; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] -; GFX1164-NEXT: s_add_i32 s2, s2, s6 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB10_4 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: ds_sub_u32 v0, v1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB10_4: -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: sub_i32_varying_nouse: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB10_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1132-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s2 -; GFX1132-NEXT: s_add_i32 s0, s0, s3 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132-NEXT: s_cbranch_execz .LBB10_4 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; GFX1132-NEXT: ds_sub_u32 v0, v1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB10_4: -; GFX1132-NEXT: s_endpgm -entry: - %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 %lane acq_rel - ret void -} - -define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { -; -; -; GFX7LESS-LABEL: sub_i64_constant: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB11_2 -; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB11_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v0 -; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 -; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 -; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: sub_i64_constant: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB11_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX8-NEXT: s_mul_i32 s4, s4, 5 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB11_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: v_readfirstlane_b32 s4, v1 -; GFX8-NEXT: v_readfirstlane_b32 s5, v0 -; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s5, v0 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: sub_i64_constant: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB11_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: s_mul_i32 s4, s4, 5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB11_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: sub_i64_constant: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB11_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mul_i32 s4, s4, 5 -; GFX1064-NEXT: v_mov_b32_e32 v0, s4 -; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB11_2: -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 -; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: sub_i64_constant: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 s5, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB11_2 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB13_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s1, s1, 5 -; GFX1032-NEXT: v_mov_b32_e32 v0, s1 -; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_mul_i32 s6, s3, s5 +; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 +; GFX1032-NEXT: s_mul_i32 s5, s2, s5 +; GFX1032-NEXT: s_add_i32 s7, s7, s6 +; GFX1032-NEXT: v_mov_b32_e32 v0, s5 +; GFX1032-NEXT: v_mov_b32_e32 v1, s7 +; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB11_2: +; GFX1032-NEXT: .LBB13_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s2, v2, 0 +; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5] ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 -; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 +; GFX1032-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: sub_i64_constant: +; GFX1164-LABEL: sub_i64_uniform: ; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB11_2 +; GFX1164-NEXT: s_cbranch_execz .LBB13_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_mul_i32 s4, s4, 5 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mov_b32_e32 v0, s4 -; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_mul_i32 s7, s3, s6 +; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6 +; GFX1164-NEXT: s_mul_i32 s6, s2, s6 +; GFX1164-NEXT: s_add_i32 s8, s8, s7 +; GFX1164-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164-NEXT: v_mov_b32_e32 v1, s8 +; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB11_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: .LBB13_2: +; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0 -; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1164-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1164-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5] +; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mov_b32_e32 v1, v5 +; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: sub_i64_constant: +; GFX1132-LABEL: sub_i64_uniform: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 +; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB11_2 +; GFX1132-NEXT: s_cbranch_execz .LBB13_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_mul_i32 s1, s1, 5 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_e32 v0, s1 -; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_mul_i32 s6, s3, s5 +; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 +; GFX1132-NEXT: s_mul_i32 s5, s2, s5 +; GFX1132-NEXT: s_add_i32 s7, s7, s6 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s7 +; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB11_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: .LBB13_2: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 -; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1132-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5] +; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_mov_b32_e32 v1, v5 +; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: - %old = atomicrmw sub ptr addrspace(3) @local_var64, i64 5 acq_rel + %old = atomicrmw sub ptr addrspace(3) @local_var64, i64 %subitive acq_rel store i64 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) { +define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: sub_i64_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s6 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 +; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB14_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB14_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX7LESS_ITERATIVE-NEXT: v_sub_i32_e32 v0, vcc, s5, v1 +; GFX7LESS_ITERATIVE-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm ; +; GFX8_ITERATIVE-LABEL: sub_i64_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX8_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB14_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_ITERATIVE-NEXT: v_sub_u32_e32 v0, vcc, s5, v1 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm ; -; GFX7LESS-LABEL: sub_i64_uniform: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB12_2 -; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 -; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 -; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 -; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB12_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s4, s0 -; GFX7LESS-NEXT: s_mov_b32 s5, s1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v0 -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 -; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 -; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 -; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s0 -; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s1, v2 -; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc -; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX7LESS-NEXT: s_endpgm +; GFX9_ITERATIVE-LABEL: sub_i64_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX9_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB14_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_ITERATIVE-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v1 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v2, vcc +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm ; -; GFX8-LABEL: sub_i64_uniform: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB12_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 -; GFX8-NEXT: s_mul_i32 s6, s3, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB12_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s0 -; GFX8-NEXT: s_mov_b32 s5, s1 -; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 -; GFX8-NEXT: v_readfirstlane_b32 s1, v0 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s1, v2 -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX8-NEXT: s_endpgm +; GFX1064_ITERATIVE-LABEL: sub_i64_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 +; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 +; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB14_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1064_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v1 +; GFX1064_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm ; -; GFX9-LABEL: sub_i64_uniform: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[6:7], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB12_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s7, s3, s6 -; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 -; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_i32 s6, s2, s6 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB12_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_readfirstlane_b32 s1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s1, v3 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX9-NEXT: s_endpgm +; GFX1032_ITERATIVE-LABEL: sub_i64_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 +; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 +; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB14_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1032_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1 +; GFX1032_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm ; -; GFX1064-LABEL: sub_i64_uniform: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b64 s[6:7], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB12_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s7, s3, s6 -; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 -; GFX1064-NEXT: s_mul_i32 s6, s2, s6 -; GFX1064-NEXT: s_add_i32 s8, s8, s7 -; GFX1064-NEXT: v_mov_b32_e32 v0, s6 -; GFX1064-NEXT: v_mov_b32_e32 v1, s8 -; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB12_2: -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 -; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v3 -; GFX1064-NEXT: v_mov_b32_e32 v1, v4 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc -; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm +; GFX1164_ITERATIVE-LABEL: sub_i64_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 +; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1164_ITERATIVE-NEXT: ds_sub_rtn_u64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB14_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v0 +; GFX1164_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm ; -; GFX1032-LABEL: sub_i64_uniform: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB12_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s6, s3, s5 -; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 -; GFX1032-NEXT: s_mul_i32 s5, s2, s5 -; GFX1032-NEXT: s_add_i32 s7, s7, s6 -; GFX1032-NEXT: v_mov_b32_e32 v0, s5 -; GFX1032-NEXT: v_mov_b32_e32 v1, s7 -; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB12_2: -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s2, v2, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5] -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 -; GFX1032-NEXT: v_mov_b32_e32 v1, v4 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo -; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm +; GFX1132_ITERATIVE-LABEL: sub_i64_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 +; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_sub_rtn_u64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB14_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 +; GFX1132_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm ; -; GFX1164-LABEL: sub_i64_uniform: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b64 s[6:7], exec -; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB12_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mul_i32 s7, s3, s6 -; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6 -; GFX1164-NEXT: s_mul_i32 s6, s2, s6 -; GFX1164-NEXT: s_add_i32 s8, s8, s7 -; GFX1164-NEXT: v_mov_b32_e32 v0, s6 -; GFX1164-NEXT: v_mov_b32_e32 v1, s8 -; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB12_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1164-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5] -; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v1, v5 -; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: sub_i64_uniform: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b32 s5, exec_lo -; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB12_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mul_i32 s6, s3, s5 -; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 -; GFX1132-NEXT: s_mul_i32 s5, s2, s5 -; GFX1132-NEXT: s_add_i32 s7, s7, s6 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s7 -; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB12_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5] -; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v1, v5 -; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm -entry: - %old = atomicrmw sub ptr addrspace(3) @local_var64, i64 %subitive acq_rel - store i64 %old, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { +; GFX7LESS_DPP-LABEL: sub_i64_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm ; +; GFX8_DPP-LABEL: sub_i64_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB14_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v8, v[9:10] +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB14_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v10 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v9 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: v_sub_u32_e32 v7, vcc, s5, v7 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_subb_u32_e32 v8, vcc, v0, v8, vcc +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm ; -; GFX7LESS-LABEL: sub_i64_varying: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm +; GFX9_DPP-LABEL: sub_i64_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB14_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX9_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v8, v[9:10] +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB14_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v10 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v9 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: v_sub_co_u32_e32 v7, vcc, s5, v7 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_subb_co_u32_e32 v8, vcc, v0, v8, vcc +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm ; -; GFX8-LABEL: sub_i64_varying: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX8-NEXT: s_endpgm +; GFX1064_DPP-LABEL: sub_i64_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v3, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v4, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s9, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB14_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1064_DPP-NEXT: ds_sub_rtn_u64 v[11:12], v10, v[11:12] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB14_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_sub_co_u32 v9, vcc, s0, v9 +; GFX1064_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc, s1, v10, vcc +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm ; -; GFX9-LABEL: sub_i64_varying: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX1032_DPP-LABEL: sub_i64_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB14_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1032_DPP-NEXT: ds_sub_rtn_u64 v[11:12], v10, v[11:12] +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB14_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_sub_co_u32 v9, vcc_lo, s0, v9 +; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, s1, v10, vcc_lo +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm ; -; GFX10-LABEL: sub_i64_varying: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX10-NEXT: s_endpgm -; -; GFX1164-LABEL: sub_i64_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm +; GFX1164_DPP-LABEL: sub_i64_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s4, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB14_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s0 +; GFX1164_DPP-NEXT: ds_sub_rtn_u64 v[10:11], v9, v[10:11] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB14_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v2 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_sub_co_u32 v8, vcc, s0, v8 +; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc, s1, v9, vcc +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm ; -; GFX1132-LABEL: sub_i64_varying: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm +; GFX1132_DPP-LABEL: sub_i64_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v4, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v3, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB14_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX1132_DPP-NEXT: ds_sub_rtn_u64 v[10:11], v9, v[10:11] +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB14_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v2 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s0, v8 +; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s1, v9, vcc_lo +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 @@ -3087,277 +6671,638 @@ entry: } define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: and_i32_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_and_b32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB15_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB15_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS_ITERATIVE-NEXT: v_and_b32_e32 v0, s4, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm ; +; GFX8_ITERATIVE-LABEL: and_i32_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8_ITERATIVE-NEXT: s_and_b32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB15_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_and_b32_e32 v0, s4, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm ; -; GFX7LESS-LABEL: and_i32_varying: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm +; GFX9_ITERATIVE-LABEL: and_i32_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9_ITERATIVE-NEXT: s_and_b32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB15_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_and_b32_e32 v0, s4, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm ; -; GFX8-LABEL: and_i32_varying: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s4, -1 -; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: .LBB14_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8-NEXT: s_and_b32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB14_4 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v2 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB14_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_and_b32_e32 v0, s4, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX8-NEXT: s_endpgm +; GFX1064_ITERATIVE-LABEL: and_i32_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_and_b32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB15_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm ; -; GFX9-LABEL: and_i32_varying: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s4, -1 -; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: .LBB14_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s5 -; GFX9-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9-NEXT: s_and_b32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB14_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB14_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_and_b32_e32 v0, s4, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX1032_ITERATIVE-LABEL: and_i32_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_and_b32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB15_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm ; -; GFX1064-LABEL: and_i32_varying: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s4, -1 -; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: .LBB14_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064-NEXT: s_and_b32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB14_4 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v2 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB14_4: -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_and_b32_e32 v0, s2, v1 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm +; GFX1164_ITERATIVE-LABEL: and_i32_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: s_and_b32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: ds_and_rtn_b32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB15_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm ; -; GFX1032-LABEL: and_i32_varying: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_mov_b32 s0, -1 -; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: .LBB14_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_and_b32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032-NEXT: s_cbranch_execz .LBB14_4 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 -; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v2 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB14_4: -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_and_b32_e32 v0, s2, v1 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm +; GFX1132_ITERATIVE-LABEL: and_i32_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: s_and_b32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_and_rtn_b32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB15_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm ; -; GFX1164-LABEL: and_i32_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b32 s4, -1 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: .LBB14_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-NEXT: s_and_b32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB14_4 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_and_rtn_b32 v1, v1, v2 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB14_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm +; GFX7LESS_DPP-LABEL: and_i32_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_and_rtn_b32 v0, v1, v0 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm ; -; GFX1132-LABEL: and_i32_varying: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, -1 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: .LBB14_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_and_b32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132-NEXT: s_cbranch_execz .LBB14_4 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132-NEXT: ds_and_rtn_b32 v1, v1, v2 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB14_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm +; GFX8_DPP-LABEL: and_i32_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB15_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_and_rtn_b32 v0, v0, v3 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB15_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: and_i32_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB15_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_DPP-NEXT: ds_and_rtn_b32 v0, v0, v3 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB15_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: and_i32_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB15_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB15_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: and_i32_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB15_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB15_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: and_i32_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB15_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB15_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: and_i32_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB15_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB15_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = atomicrmw and ptr addrspace(3) @local_var32, i32 %lane acq_rel @@ -3365,1394 +7310,4362 @@ entry: ret void } -define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { +define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: and_i64_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[8:9], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[8:9] +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB16_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB16_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX7LESS_ITERATIVE-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7LESS_ITERATIVE-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm ; +; GFX8_ITERATIVE-LABEL: and_i64_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX8_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB16_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX8_ITERATIVE-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm ; -; GFX7LESS-LABEL: or_i32_varying: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm +; GFX9_ITERATIVE-LABEL: and_i64_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX9_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB16_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX9_ITERATIVE-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm ; -; GFX8-LABEL: or_i32_varying: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s4, 0 -; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: .LBB15_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8-NEXT: s_or_b32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB15_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB15_4 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v2 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB15_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_or_b32_e32 v0, s4, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX8-NEXT: s_endpgm +; GFX1064_ITERATIVE-LABEL: and_i64_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB16_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1064_ITERATIVE-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX1064_ITERATIVE-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm ; -; GFX9-LABEL: or_i32_varying: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: .LBB15_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s5 -; GFX9-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9-NEXT: s_or_b32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB15_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB15_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_or_b32_e32 v0, s4, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX1032_ITERATIVE-LABEL: and_i64_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s8 +; GFX1032_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB16_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1032_ITERATIVE-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX1032_ITERATIVE-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm ; -; GFX1064-LABEL: or_i32_varying: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s4, 0 -; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: .LBB15_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064-NEXT: s_or_b32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB15_4 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v2 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB15_4: -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_or_b32_e32 v0, s2, v1 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm +; GFX1164_ITERATIVE-LABEL: and_i64_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[8:9] +; GFX1164_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1164_ITERATIVE-NEXT: ds_and_rtn_b64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB16_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm ; -; GFX1032-LABEL: or_i32_varying: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: .LBB15_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_or_b32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032-NEXT: s_cbranch_execz .LBB15_4 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 -; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v2 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB15_4: -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_or_b32_e32 v0, s2, v1 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm +; GFX1132_ITERATIVE-LABEL: and_i64_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s8 +; GFX1132_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_and_rtn_b64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB16_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm ; -; GFX1164-LABEL: or_i32_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: .LBB15_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-NEXT: s_or_b32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB15_4 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_or_rtn_b32 v1, v1, v2 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB15_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm +; GFX7LESS_DPP-LABEL: and_i64_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_and_rtn_b64 v[0:1], v1, v[0:1] +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm ; -; GFX1132-LABEL: or_i32_varying: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: .LBB15_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_or_b32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132-NEXT: s_cbranch_execz .LBB15_4 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132-NEXT: ds_or_rtn_b32 v1, v1, v2 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB15_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm +; GFX8_DPP-LABEL: and_i64_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB16_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_and_rtn_b64 v[7:8], v6, v[7:8] +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB16_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX8_DPP-NEXT: v_and_b32_e32 v5, s5, v5 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: and_i64_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB16_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX9_DPP-NEXT: ds_and_rtn_b64 v[7:8], v6, v[7:8] +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB16_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX9_DPP-NEXT: v_and_b32_e32 v5, s5, v5 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: and_i64_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, -1 +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s4, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB16_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB16_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_and_b32_e32 v8, s0, v8 +; GFX1064_DPP-NEXT: v_and_b32_e32 v7, s1, v7 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: and_i64_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, -1 +; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB16_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1032_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10] +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB16_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_and_b32_e32 v8, s0, v8 +; GFX1032_DPP-NEXT: v_and_b32_e32 v7, s1, v7 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: and_i64_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB16_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1164_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB16_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_e32 v8, s0, v8 +; GFX1164_DPP-NEXT: v_and_b32_e32 v7, s1, v7 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: and_i64_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB16_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0 +; GFX1132_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10] +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB16_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_e32 v8, s0, v8 +; GFX1132_DPP-NEXT: v_and_b32_e32 v7, s1, v7 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = atomicrmw or ptr addrspace(3) @local_var32, i32 %lane acq_rel - store i32 %old, ptr addrspace(1) %out + %lane_ext = zext i32 %lane to i64 + %old = atomicrmw and ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel + store i64 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { +define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: or_i32_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_or_b32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB17_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB17_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS_ITERATIVE-NEXT: v_or_b32_e32 v0, s4, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm ; +; GFX8_ITERATIVE-LABEL: or_i32_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8_ITERATIVE-NEXT: s_or_b32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB17_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_or_b32_e32 v0, s4, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm ; -; GFX7LESS-LABEL: xor_i32_varying: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm +; GFX9_ITERATIVE-LABEL: or_i32_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9_ITERATIVE-NEXT: s_or_b32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB17_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_or_b32_e32 v0, s4, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm ; -; GFX8-LABEL: xor_i32_varying: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s4, 0 -; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: .LBB16_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8-NEXT: s_xor_b32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB16_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB16_4 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v2 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB16_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_xor_b32_e32 v0, s4, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX8-NEXT: s_endpgm +; GFX1064_ITERATIVE-LABEL: or_i32_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_or_b32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB17_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm ; -; GFX9-LABEL: xor_i32_varying: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: .LBB16_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s5 -; GFX9-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9-NEXT: s_xor_b32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB16_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB16_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB16_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_xor_b32_e32 v0, s4, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX1032_ITERATIVE-LABEL: or_i32_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_or_b32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB17_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm ; -; GFX1064-LABEL: xor_i32_varying: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s4, 0 -; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: .LBB16_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064-NEXT: s_xor_b32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_cbranch_scc1 .LBB16_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB16_4 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v2 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB16_4: -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_xor_b32_e32 v0, s2, v1 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm +; GFX1164_ITERATIVE-LABEL: or_i32_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: s_or_b32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: ds_or_rtn_b32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB17_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm ; -; GFX1032-LABEL: xor_i32_varying: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: .LBB16_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_xor_b32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_cbranch_scc1 .LBB16_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032-NEXT: s_cbranch_execz .LBB16_4 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 -; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v2 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB16_4: -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_xor_b32_e32 v0, s2, v1 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm +; GFX1132_ITERATIVE-LABEL: or_i32_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: s_or_b32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_or_rtn_b32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB17_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm ; -; GFX1164-LABEL: xor_i32_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: .LBB16_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-NEXT: s_xor_b32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_cbranch_scc1 .LBB16_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB16_4 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_xor_rtn_b32 v1, v1, v2 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB16_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm +; GFX7LESS_DPP-LABEL: or_i32_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_or_rtn_b32 v0, v1, v0 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm ; -; GFX1132-LABEL: xor_i32_varying: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: .LBB16_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_xor_b32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132-NEXT: s_cbranch_scc1 .LBB16_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132-NEXT: s_cbranch_execz .LBB16_4 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132-NEXT: ds_xor_rtn_b32 v1, v1, v2 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB16_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm +; GFX8_DPP-LABEL: or_i32_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB17_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_or_rtn_b32 v0, v3, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB17_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_or_b32_e32 v0, s4, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: or_i32_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB17_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: ds_or_rtn_b32 v0, v3, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB17_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_or_b32_e32 v0, s4, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: or_i32_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB17_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB17_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: or_i32_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB17_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB17_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: or_i32_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB17_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB17_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: or_i32_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB17_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB17_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = atomicrmw xor ptr addrspace(3) @local_var32, i32 %lane acq_rel + %old = atomicrmw or ptr addrspace(3) @local_var32, i32 %lane acq_rel store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { +define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: or_i64_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[8:9], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[8:9] +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB18_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB18_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX7LESS_ITERATIVE-NEXT: v_or_b32_e32 v2, s4, v2 +; GFX7LESS_ITERATIVE-NEXT: v_or_b32_e32 v1, s5, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm ; +; GFX8_ITERATIVE-LABEL: or_i64_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX8_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB18_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_or_b32_e32 v2, s4, v2 +; GFX8_ITERATIVE-NEXT: v_or_b32_e32 v1, s5, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm ; -; GFX7LESS-LABEL: max_i32_varying: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm +; GFX9_ITERATIVE-LABEL: or_i64_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX9_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB18_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_or_b32_e32 v2, s4, v2 +; GFX9_ITERATIVE-NEXT: v_or_b32_e32 v1, s5, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm ; -; GFX8-LABEL: max_i32_varying: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_brev_b32 s4, 1 -; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: .LBB17_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8-NEXT: s_max_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB17_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB17_4 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v2 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB17_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_max_i32_e32 v0, s4, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX8-NEXT: s_endpgm +; GFX1064_ITERATIVE-LABEL: or_i64_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB18_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1064_ITERATIVE-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX1064_ITERATIVE-NEXT: v_or_b32_e32 v1, s3, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm ; -; GFX9-LABEL: max_i32_varying: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_brev_b32 s4, 1 -; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: .LBB17_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s5 -; GFX9-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9-NEXT: s_max_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB17_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB17_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_max_i32_e32 v0, s4, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX1032_ITERATIVE-LABEL: or_i64_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s8 +; GFX1032_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB18_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1032_ITERATIVE-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX1032_ITERATIVE-NEXT: v_or_b32_e32 v1, s3, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm ; -; GFX1064-LABEL: max_i32_varying: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_brev_b32 s4, 1 -; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: .LBB17_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064-NEXT: s_max_i32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB17_4 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v2 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB17_4: -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_max_i32_e32 v0, s2, v1 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm +; GFX1164_ITERATIVE-LABEL: or_i64_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[8:9] +; GFX1164_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1164_ITERATIVE-NEXT: ds_or_rtn_b64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB18_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_or_b32_e32 v1, s2, v1 +; GFX1164_ITERATIVE-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm ; -; GFX1032-LABEL: max_i32_varying: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_brev_b32 s0, 1 -; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: .LBB17_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_max_i32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032-NEXT: s_cbranch_execz .LBB17_4 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 -; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v2 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB17_4: -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_max_i32_e32 v0, s2, v1 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm +; GFX1132_ITERATIVE-LABEL: or_i64_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s8 +; GFX1132_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_or_rtn_b64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB18_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: v_or_b32_e32 v1, s2, v1 +; GFX1132_ITERATIVE-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm ; -; GFX1164-LABEL: max_i32_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_brev_b32 s4, 1 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: .LBB17_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-NEXT: s_max_i32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB17_4 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_max_rtn_i32 v1, v1, v2 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB17_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_i32_e32 v0, s2, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm +; GFX7LESS_DPP-LABEL: or_i64_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_or_rtn_b64 v[0:1], v1, v[0:1] +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm ; -; GFX1132-LABEL: max_i32_varying: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_brev_b32 s0, 1 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: .LBB17_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_max_i32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132-NEXT: s_cbranch_execz .LBB17_4 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132-NEXT: ds_max_rtn_i32 v1, v1, v2 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB17_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_i32_e32 v0, s2, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm +; GFX8_DPP-LABEL: or_i64_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB18_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_or_rtn_b64 v[7:8], v6, v[7:8] +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB18_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_or_b32_e32 v6, s4, v6 +; GFX8_DPP-NEXT: v_or_b32_e32 v5, s5, v5 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: or_i64_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB18_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX9_DPP-NEXT: ds_or_rtn_b64 v[7:8], v6, v[7:8] +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB18_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_or_b32_e32 v6, s4, v6 +; GFX9_DPP-NEXT: v_or_b32_e32 v5, s5, v5 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: or_i64_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s4, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB18_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB18_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_or_b32_e32 v8, s0, v8 +; GFX1064_DPP-NEXT: v_or_b32_e32 v7, s1, v7 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: or_i64_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB18_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1032_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10] +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB18_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_or_b32_e32 v8, s0, v8 +; GFX1032_DPP-NEXT: v_or_b32_e32 v7, s1, v7 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: or_i64_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB18_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1164_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB18_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_e32 v8, s0, v8 +; GFX1164_DPP-NEXT: v_or_b32_e32 v7, s1, v7 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: or_i64_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB18_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0 +; GFX1132_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10] +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB18_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_e32 v8, s0, v8 +; GFX1132_DPP-NEXT: v_or_b32_e32 v7, s1, v7 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = atomicrmw max ptr addrspace(3) @local_var32, i32 %lane acq_rel + %lane_ext = zext i32 %lane to i64 + %old = atomicrmw or ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel + store i64 %old, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: xor_i32_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_xor_b32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB19_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB19_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS_ITERATIVE-NEXT: v_xor_b32_e32 v0, s4, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm +; +; GFX8_ITERATIVE-LABEL: xor_i32_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8_ITERATIVE-NEXT: s_xor_b32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB19_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_xor_b32_e32 v0, s4, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm +; +; GFX9_ITERATIVE-LABEL: xor_i32_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9_ITERATIVE-NEXT: s_xor_b32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB19_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_xor_b32_e32 v0, s4, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm +; +; GFX1064_ITERATIVE-LABEL: xor_i32_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_xor_b32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB19_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm +; +; GFX1032_ITERATIVE-LABEL: xor_i32_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB19_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: xor_i32_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: s_xor_b32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: ds_xor_rtn_b32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB19_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: xor_i32_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_xor_rtn_b32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB19_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: xor_i32_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_xor_rtn_b32 v0, v1, v0 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: xor_i32_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB19_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_xor_rtn_b32 v0, v3, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB19_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: xor_i32_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB19_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: ds_xor_rtn_b32 v0, v3, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB19_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: xor_i32_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB19_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB19_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: xor_i32_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB19_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB19_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: xor_i32_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB19_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB19_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: xor_i32_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB19_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB19_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = atomicrmw xor ptr addrspace(3) @local_var32, i32 %lane acq_rel store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { +define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: xor_i64_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[8:9], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[8:9] +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB20_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB20_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX7LESS_ITERATIVE-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX7LESS_ITERATIVE-NEXT: v_xor_b32_e32 v1, s5, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm ; +; GFX8_ITERATIVE-LABEL: xor_i64_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB20_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX8_ITERATIVE-NEXT: v_xor_b32_e32 v1, s5, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm ; -; GFX7LESS-LABEL: max_i64_constant: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB18_2 -; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB18_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 -; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc -; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 -; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] -; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 -; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm +; GFX9_ITERATIVE-LABEL: xor_i64_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB20_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX9_ITERATIVE-NEXT: v_xor_b32_e32 v1, s5, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm ; -; GFX8-LABEL: max_i64_constant: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB18_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, 5 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB18_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: v_readfirstlane_b32 s5, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc -; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX8-NEXT: s_endpgm +; GFX1064_ITERATIVE-LABEL: xor_i64_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB20_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1064_ITERATIVE-NEXT: v_xor_b32_e32 v2, s2, v2 +; GFX1064_ITERATIVE-NEXT: v_xor_b32_e32 v1, s3, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm ; -; GFX9-LABEL: max_i64_constant: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB18_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB18_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_readfirstlane_b32 s5, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc -; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX1032_ITERATIVE-LABEL: xor_i64_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s8 +; GFX1032_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB20_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1032_ITERATIVE-NEXT: v_xor_b32_e32 v2, s2, v2 +; GFX1032_ITERATIVE-NEXT: v_xor_b32_e32 v1, s3, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm ; -; GFX1064-LABEL: max_i64_constant: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB18_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, 5 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB18_2: -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc -; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc -; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] -; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc -; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm +; GFX1164_ITERATIVE-LABEL: xor_i64_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[8:9] +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1164_ITERATIVE-NEXT: ds_xor_rtn_b64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB20_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_xor_b32_e32 v1, s2, v1 +; GFX1164_ITERATIVE-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm ; -; GFX1032-LABEL: max_i64_constant: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB18_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, 5 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB18_2: -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo -; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo -; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] -; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo -; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm +; GFX1132_ITERATIVE-LABEL: xor_i64_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s8 +; GFX1132_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_xor_rtn_b64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB20_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: v_xor_b32_e32 v1, s2, v1 +; GFX1132_ITERATIVE-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm ; -; GFX1164-LABEL: max_i64_constant: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB18_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, 5 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB18_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc -; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] -; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc -; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm +; GFX7LESS_DPP-LABEL: xor_i64_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_xor_rtn_b64 v[0:1], v1, v[0:1] +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm ; -; GFX1132-LABEL: max_i64_constant: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB18_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, 5 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 -; GFX1132-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB18_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo -; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] -; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo -; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm +; GFX8_DPP-LABEL: xor_i64_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB20_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v6, v[7:8] +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB20_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_xor_b32_e32 v6, s4, v6 +; GFX8_DPP-NEXT: v_xor_b32_e32 v5, s5, v5 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: xor_i64_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB20_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX9_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v6, v[7:8] +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB20_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_xor_b32_e32 v6, s4, v6 +; GFX9_DPP-NEXT: v_xor_b32_e32 v5, s5, v5 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: xor_i64_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s4, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB20_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB20_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_xor_b32_e32 v8, s0, v8 +; GFX1064_DPP-NEXT: v_xor_b32_e32 v7, s1, v7 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: xor_i64_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB20_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1032_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10] +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB20_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_xor_b32_e32 v8, s0, v8 +; GFX1032_DPP-NEXT: v_xor_b32_e32 v7, s1, v7 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: xor_i64_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB20_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1164_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB20_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_xor_b32_e32 v8, s0, v8 +; GFX1164_DPP-NEXT: v_xor_b32_e32 v7, s1, v7 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: xor_i64_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB20_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0 +; GFX1132_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10] +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB20_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_xor_b32_e32 v8, s0, v8 +; GFX1132_DPP-NEXT: v_xor_b32_e32 v7, s1, v7 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm entry: - %old = atomicrmw max ptr addrspace(3) @local_var64, i64 5 acq_rel + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %lane_ext = zext i32 %lane to i64 + %old = atomicrmw xor ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel store i64 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { +define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: max_i32_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s4, 1 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_max_i32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB21_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB21_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS_ITERATIVE-NEXT: v_max_i32_e32 v0, s4, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm ; +; GFX8_ITERATIVE-LABEL: max_i32_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_brev_b32 s4, 1 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8_ITERATIVE-NEXT: s_max_i32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB21_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_max_i32_e32 v0, s4, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm ; -; GFX7LESS-LABEL: min_i32_varying: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm +; GFX9_ITERATIVE-LABEL: max_i32_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_brev_b32 s4, 1 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9_ITERATIVE-NEXT: s_max_i32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB21_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_max_i32_e32 v0, s4, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm ; -; GFX8-LABEL: min_i32_varying: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_brev_b32 s4, -2 -; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: .LBB19_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8-NEXT: s_min_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB19_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB19_4 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v2 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB19_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_min_i32_e32 v0, s4, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX8-NEXT: s_endpgm +; GFX1064_ITERATIVE-LABEL: max_i32_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_brev_b32 s4, 1 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_max_i32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB21_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm ; -; GFX9-LABEL: min_i32_varying: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_brev_b32 s4, -2 -; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: .LBB19_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s5 -; GFX9-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9-NEXT: s_min_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB19_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB19_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB19_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_min_i32_e32 v0, s4, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX1032_ITERATIVE-LABEL: max_i32_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1032_ITERATIVE-NEXT: s_brev_b32 s0, 1 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_max_i32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB21_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm ; -; GFX1064-LABEL: min_i32_varying: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_brev_b32 s4, -2 -; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: .LBB19_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064-NEXT: s_min_i32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_cbranch_scc1 .LBB19_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB19_4 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v2 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB19_4: -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_min_i32_e32 v0, s2, v1 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm +; GFX1164_ITERATIVE-LABEL: max_i32_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_brev_b32 s4, 1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: s_max_i32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: ds_max_rtn_i32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB21_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm ; -; GFX1032-LABEL: min_i32_varying: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_brev_b32 s0, -2 -; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: .LBB19_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_min_i32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_cbranch_scc1 .LBB19_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032-NEXT: s_cbranch_execz .LBB19_4 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 -; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v2 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB19_4: -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_min_i32_e32 v0, s2, v1 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm +; GFX1132_ITERATIVE-LABEL: max_i32_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1132_ITERATIVE-NEXT: s_brev_b32 s0, 1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: s_max_i32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_max_rtn_i32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB21_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm ; -; GFX1164-LABEL: min_i32_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_brev_b32 s4, -2 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: .LBB19_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-NEXT: s_min_i32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_cbranch_scc1 .LBB19_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB19_4 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_min_rtn_i32 v1, v1, v2 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB19_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_i32_e32 v0, s2, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm +; GFX7LESS_DPP-LABEL: max_i32_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_max_rtn_i32 v0, v1, v0 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm ; -; GFX1132-LABEL: min_i32_varying: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_brev_b32 s0, -2 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: .LBB19_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_min_i32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132-NEXT: s_cbranch_scc1 .LBB19_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132-NEXT: s_cbranch_execz .LBB19_4 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132-NEXT: ds_min_rtn_i32 v1, v1, v2 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB19_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_i32_e32 v0, s2, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm +; GFX8_DPP-LABEL: max_i32_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB21_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_max_rtn_i32 v0, v0, v3 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB21_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_max_i32_e32 v0, s4, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: max_i32_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB21_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_DPP-NEXT: ds_max_rtn_i32 v0, v0, v3 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB21_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_max_i32_e32 v0, s4, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: max_i32_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB21_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB21_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_max_i32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: max_i32_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB21_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB21_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_max_i32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: max_i32_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB21_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB21_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_i32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: max_i32_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB21_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB21_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_max_i32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = atomicrmw min ptr addrspace(3) @local_var32, i32 %lane acq_rel + %old = atomicrmw max ptr addrspace(3) @local_var32, i32 %lane acq_rel store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { -; -; -; GFX7LESS-LABEL: min_i64_constant: +define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { +; GFX7LESS-LABEL: max_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB20_2 +; GFX7LESS-NEXT: s_cbranch_execz .LBB22_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] +; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB20_2: +; GFX7LESS-NEXT: .LBB22_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 -; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc +; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc ; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 -; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] +; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] ; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc @@ -4760,30 +11673,30 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; -; GFX8-LABEL: min_i64_constant: +; GFX8-LABEL: max_i64_constant: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB20_2 +; GFX8-NEXT: s_cbranch_execz .LBB22_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] +; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB20_2: +; GFX8-NEXT: .LBB22_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 +; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s4 @@ -4794,29 +11707,29 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; -; GFX9-LABEL: min_i64_constant: +; GFX9-LABEL: max_i64_constant: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] +; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB20_2: +; GFX9-NEXT: .LBB22_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 +; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s4 @@ -4827,31 +11740,31 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: min_i64_constant: +; GFX1064-LABEL: max_i64_constant: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB20_2 +; GFX1064-NEXT: s_cbranch_execz .LBB22_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] +; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB20_2: +; GFX1064-NEXT: .LBB22_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc -; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc -; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc +; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc +; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] ; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc ; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -4860,30 +11773,30 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: min_i64_constant: +; GFX1032-LABEL: max_i64_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB20_2 +; GFX1032-NEXT: s_cbranch_execz .LBB22_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] +; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB20_2: +; GFX1032-NEXT: .LBB22_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo -; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo -; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo +; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo +; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] ; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo ; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -4892,7 +11805,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: min_i64_constant: +; GFX1164-LABEL: max_i64_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4900,23 +11813,23 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB20_2 +; GFX1164-NEXT: s_cbranch_execz .LBB22_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] +; GFX1164-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB20_2: +; GFX1164-NEXT: .LBB22_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc -; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc +; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc +; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] ; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc ; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -4927,29 +11840,29 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: min_i64_constant: +; GFX1132-LABEL: max_i64_constant: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB20_2 +; GFX1132-NEXT: s_cbranch_execz .LBB22_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 -; GFX1132-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] +; GFX1132-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB20_2: +; GFX1132-NEXT: .LBB22_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo -; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo +; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo +; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX1132-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] ; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo ; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -4960,283 +11873,3632 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: - %old = atomicrmw min ptr addrspace(3) @local_var64, i64 5 acq_rel + %old = atomicrmw max ptr addrspace(3) @local_var64, i64 5 acq_rel store i64 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { +define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: max_i64_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s1, 1 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB23_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB23_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[1:2] +; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm ; +; GFX8_ITERATIVE-LABEL: max_i64_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_ITERATIVE-NEXT: s_brev_b32 s1, 1 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX8_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[4:5] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB23_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 +; GFX8_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[1:2] +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 +; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm ; -; GFX7LESS-LABEL: umax_i32_varying: +; GFX9_ITERATIVE-LABEL: max_i64_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_ITERATIVE-NEXT: s_brev_b32 s1, 1 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX9_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[4:5] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB23_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 +; GFX9_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[1:2] +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 +; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm +; +; GFX1064_ITERATIVE-LABEL: max_i64_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_brev_b32 s1, 1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 +; GFX1064_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s[8:9], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB23_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1064_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[1:2] +; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc +; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm +; +; GFX1032_ITERATIVE-LABEL: max_i64_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_brev_b32 s1, 1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s8, s[0:1], s[6:7] +; GFX1032_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB23_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1032_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[1:2] +; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo +; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: max_i64_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_brev_b32 s1, 1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .p2align 6 +; GFX1164_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 +; GFX1164_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s[8:9], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1164_ITERATIVE-NEXT: ds_max_rtn_i64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB23_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] +; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc +; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: max_i64_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_brev_b32 s1, 1 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .p2align 6 +; GFX1132_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; GFX1132_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s8, s[0:1], s[6:7] +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_max_rtn_i64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB23_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: max_i64_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_max_rtn_i64 v[0:1], v1, v[0:1] +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: max_i64_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX8_DPP-NEXT: s_mov_b32 s0, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX8_DPP-NEXT: s_brev_b32 s1, 1 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s1, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s0, v1, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB23_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_max_rtn_i64 v[9:10], v8, v[9:10] +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB23_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8] +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5 +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: max_i64_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX9_DPP-NEXT: s_mov_b32 s0, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9_DPP-NEXT: s_brev_b32 s1, 1 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s1, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s0, v1, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB23_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX9_DPP-NEXT: ds_max_rtn_i64 v[9:10], v8, v[9:10] +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB23_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8] +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5 +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: max_i64_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1064_DPP-NEXT: s_mov_b32 s0, 0 +; GFX1064_DPP-NEXT: s_brev_b32 s1, 1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB23_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1064_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB23_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[9:10] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: max_i64_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032_DPP-NEXT: s_brev_b32 s1, 1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB23_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1032_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12] +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB23_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[9:10] +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc_lo +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: max_i64_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1164_DPP-NEXT: s_mov_b32 s0, 0 +; GFX1164_DPP-NEXT: s_brev_b32 s1, 1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s1 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[3:4] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s4, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s6, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB23_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1164_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB23_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[9:10] +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: max_i64_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1132_DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132_DPP-NEXT: s_brev_b32 s1, 1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, s1 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[5:6], v[3:4] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB23_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 +; GFX1132_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12] +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB23_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[9:10] +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc_lo +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc_lo +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %lane_ext = zext i32 %lane to i64 + %old = atomicrmw max ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel + store i64 %old, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: min_i32_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s4, -2 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_min_i32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB24_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB24_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS_ITERATIVE-NEXT: v_min_i32_e32 v0, s4, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm +; +; GFX8_ITERATIVE-LABEL: min_i32_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_brev_b32 s4, -2 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8_ITERATIVE-NEXT: s_min_i32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB24_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_min_i32_e32 v0, s4, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm +; +; GFX9_ITERATIVE-LABEL: min_i32_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_brev_b32 s4, -2 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9_ITERATIVE-NEXT: s_min_i32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB24_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_min_i32_e32 v0, s4, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm +; +; GFX1064_ITERATIVE-LABEL: min_i32_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_brev_b32 s4, -2 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_min_i32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB24_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm +; +; GFX1032_ITERATIVE-LABEL: min_i32_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1032_ITERATIVE-NEXT: s_brev_b32 s0, -2 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_min_i32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB24_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: min_i32_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_brev_b32 s4, -2 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: s_min_i32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: ds_min_rtn_i32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB24_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: min_i32_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1132_ITERATIVE-NEXT: s_brev_b32 s0, -2 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: s_min_i32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_min_rtn_i32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB24_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: min_i32_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_min_rtn_i32 v0, v1, v0 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: min_i32_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, -2 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB24_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_min_rtn_i32 v0, v0, v3 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB24_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: min_i32_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, -2 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB24_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_DPP-NEXT: ds_min_rtn_i32 v0, v0, v3 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB24_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: min_i32_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v3, -2 +; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB24_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB24_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_min_i32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: min_i32_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v3, -2 +; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB24_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB24_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_min_i32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: min_i32_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v3, -2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB24_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB24_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_i32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: min_i32_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v3, -2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB24_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB24_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_min_i32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = atomicrmw min ptr addrspace(3) @local_var32, i32 %lane acq_rel + store i32 %old, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { +; GFX7LESS-LABEL: min_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB25_2 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 +; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: .LBB25_2: +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc +; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 +; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] +; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; -; GFX8-LABEL: umax_i32_varying: +; GFX8-LABEL: min_i64_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s4, 0 -; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: .LBB21_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8-NEXT: s_max_u32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB21_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB21_4 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_cbranch_execz .LBB25_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: v_mov_b32_e32 v0, 5 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v2 +; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB21_4: +; GFX8-NEXT: .LBB25_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: v_readfirstlane_b32 s5, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_max_u32_e32 v0, s4, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; -; GFX9-LABEL: umax_i32_varying: +; GFX9-LABEL: min_i64_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: .LBB21_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s5 -; GFX9-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9-NEXT: s_max_u32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB21_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB21_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v2 +; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: v_mov_b32_e32 v0, 5 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: .LBB25_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: v_readfirstlane_b32 s5, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_max_u32_e32 v0, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: umax_i32_varying: +; GFX1064-LABEL: min_i64_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s4, 0 -; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: .LBB21_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064-NEXT: s_max_u32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_cbranch_scc1 .LBB21_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 +; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB21_4 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v2 +; GFX1064-NEXT: s_cbranch_execz .LBB25_2 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: v_mov_b32_e32 v0, 5 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB21_4: +; GFX1064-NEXT: .LBB25_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc +; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc +; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc +; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_max_u32_e32 v0, s2, v1 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: umax_i32_varying: +; GFX1032-LABEL: min_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: .LBB21_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_max_u32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_cbranch_scc1 .LBB21_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032-NEXT: s_cbranch_execz .LBB21_4 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 -; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v2 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB25_2 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: v_mov_b32_e32 v0, 5 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB21_4: +; GFX1032-NEXT: .LBB25_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo +; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo +; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_max_u32_e32 v0, s2, v1 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: umax_i32_varying: +; GFX1164-LABEL: min_i64_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: .LBB21_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-NEXT: s_max_u32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_cbranch_scc1 .LBB21_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB21_4 -; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_cbranch_execz .LBB25_2 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: v_mov_b32_e32 v0, 5 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_max_rtn_u32 v1, v1, v2 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB21_4: +; GFX1164-NEXT: .LBB25_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc +; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_u32_e32 v0, s2, v0 +; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc +; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: umax_i32_varying: +; GFX1132-LABEL: min_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: .LBB21_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_max_u32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132-NEXT: s_cbranch_scc1 .LBB21_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132-NEXT: s_cbranch_execz .LBB21_4 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132-NEXT: ds_max_rtn_u32 v1, v1, v2 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1132-NEXT: s_cbranch_execz .LBB25_2 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: v_mov_b32_e32 v0, 5 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX1132-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB21_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: .LBB25_2: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo +; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_u32_e32 v0, s2, v0 +; GFX1132-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm +entry: + %old = atomicrmw min ptr addrspace(3) @local_var64, i64 5 acq_rel + store i64 %old, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: min_i64_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s1, -2 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB26_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB26_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[1:2] +; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm +; +; GFX8_ITERATIVE-LABEL: min_i64_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_ITERATIVE-NEXT: s_brev_b32 s1, -2 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX8_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB26_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 +; GFX8_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[1:2] +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 +; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm +; +; GFX9_ITERATIVE-LABEL: min_i64_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_ITERATIVE-NEXT: s_brev_b32 s1, -2 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX9_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB26_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 +; GFX9_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[1:2] +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 +; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm +; +; GFX1064_ITERATIVE-LABEL: min_i64_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_brev_b32 s1, -2 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 +; GFX1064_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB26_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1064_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2] +; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc +; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm +; +; GFX1032_ITERATIVE-LABEL: min_i64_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_brev_b32 s1, -2 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s8, s[0:1], s[6:7] +; GFX1032_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB26_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1032_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[1:2] +; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo +; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: min_i64_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_brev_b32 s1, -2 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .p2align 6 +; GFX1164_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 +; GFX1164_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1164_ITERATIVE-NEXT: ds_min_rtn_i64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB26_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc +; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: min_i64_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_brev_b32 s1, -2 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .p2align 6 +; GFX1132_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; GFX1132_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s8, s[0:1], s[6:7] +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_min_rtn_i64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB26_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: min_i64_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_min_rtn_i64 v[0:1], v1, v[0:1] +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: min_i64_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_DPP-NEXT: s_mov_b32 s6, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: s_brev_b32 s7, -2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s7 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s6 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB26_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_min_rtn_i64 v[9:10], v8, v[9:10] +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB26_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX8_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s0, v9 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[7:8] +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s1 +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX8_DPP-NEXT: s_mov_b32 s7, 0xf000 +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: min_i64_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_DPP-NEXT: s_mov_b32 s6, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: s_brev_b32 s7, -2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s7 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s6 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB26_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX9_DPP-NEXT: ds_min_rtn_i64 v[9:10], v8, v[9:10] +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB26_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v9 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[7:8] +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s1 +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX9_DPP-NEXT: s_mov_b32 s7, 0xf000 +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: min_i64_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1064_DPP-NEXT: s_brev_b32 s7, -2 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s7 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s6 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v3, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s10, v4, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s11, v3, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s7, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s10, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s11, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[8:9] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB26_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1064_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB26_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[9:10] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: min_i64_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1032_DPP-NEXT: s_brev_b32 s7, -2 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s7 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, s6 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB26_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1032_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12] +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB26_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[9:10] +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc_lo +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: min_i64_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_brev_b32 s7, -2 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s7 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s6 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s7 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[5:6], v[3:4] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s4, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s10, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s11, v3, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s10, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s11, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[8:9] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB26_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1164_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB26_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[9:10] +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: min_i64_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_brev_b32 s7, -2 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, s6 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, s7 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[5:6], v[3:4] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB26_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 +; GFX1132_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12] +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB26_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[9:10] +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc_lo +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc_lo +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %lane_ext = zext i32 %lane to i64 + %old = atomicrmw min ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel + store i64 %old, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: umax_i32_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_max_u32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB27_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB27_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS_ITERATIVE-NEXT: v_max_u32_e32 v0, s4, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm +; +; GFX8_ITERATIVE-LABEL: umax_i32_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8_ITERATIVE-NEXT: s_max_u32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB27_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_max_u32_e32 v0, s4, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm +; +; GFX9_ITERATIVE-LABEL: umax_i32_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9_ITERATIVE-NEXT: s_max_u32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB27_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_max_u32_e32 v0, s4, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm +; +; GFX1064_ITERATIVE-LABEL: umax_i32_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_max_u32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB27_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm +; +; GFX1032_ITERATIVE-LABEL: umax_i32_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_max_u32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB27_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: umax_i32_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: s_max_u32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: ds_max_rtn_u32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB27_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: umax_i32_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: s_max_u32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_max_rtn_u32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB27_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: umax_i32_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_max_rtn_u32 v0, v1, v0 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: umax_i32_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB27_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_max_rtn_u32 v0, v3, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB27_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_max_u32_e32 v0, s4, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: umax_i32_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB27_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: ds_max_rtn_u32 v0, v3, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB27_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_max_u32_e32 v0, s4, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: umax_i32_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB27_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB27_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_max_u32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: umax_i32_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB27_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB27_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_max_u32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: umax_i32_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB27_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB27_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_u32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: umax_i32_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB27_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB27_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_max_u32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = atomicrmw umax ptr addrspace(3) @local_var32, i32 %lane acq_rel @@ -5245,8 +15507,6 @@ entry: } define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { -; -; ; GFX7LESS-LABEL: umax_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -5254,7 +15514,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB22_2 +; GFX7LESS-NEXT: s_cbranch_execz .LBB28_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -5262,7 +15522,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB22_2: +; GFX7LESS-NEXT: .LBB28_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -5287,7 +15547,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB22_2 +; GFX8-NEXT: s_cbranch_execz .LBB28_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -5295,7 +15555,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB22_2: +; GFX8-NEXT: .LBB28_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 @@ -5320,14 +15580,14 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: s_cbranch_execz .LBB28_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB22_2: +; GFX9-NEXT: .LBB28_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 @@ -5352,7 +15612,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB22_2 +; GFX1064-NEXT: s_cbranch_execz .LBB28_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -5360,7 +15620,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB22_2: +; GFX1064-NEXT: .LBB28_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 @@ -5384,7 +15644,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB22_2 +; GFX1032-NEXT: s_cbranch_execz .LBB28_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -5392,7 +15652,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB22_2: +; GFX1032-NEXT: .LBB28_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 @@ -5418,7 +15678,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB22_2 +; GFX1164-NEXT: s_cbranch_execz .LBB28_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -5426,7 +15686,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB22_2: +; GFX1164-NEXT: .LBB28_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 @@ -5452,14 +15712,14 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB22_2 +; GFX1132-NEXT: s_cbranch_execz .LBB28_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX1132-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB22_2: +; GFX1132-NEXT: .LBB28_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 @@ -5483,278 +15743,1682 @@ entry: ret void } +define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: umax_i64_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB29_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB29_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[1:2] +; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm +; +; GFX8_ITERATIVE-LABEL: umax_i64_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX8_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB29_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 +; GFX8_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[1:2] +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 +; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm +; +; GFX9_ITERATIVE-LABEL: umax_i64_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX9_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB29_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 +; GFX9_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[1:2] +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 +; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm +; +; GFX1064_ITERATIVE-LABEL: umax_i64_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 +; GFX1064_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s[8:9], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB29_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1064_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[1:2] +; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc +; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm +; +; GFX1032_ITERATIVE-LABEL: umax_i64_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s8, s[0:1], s[6:7] +; GFX1032_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB29_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1032_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[1:2] +; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo +; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: umax_i64_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .p2align 6 +; GFX1164_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s[8:9], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1164_ITERATIVE-NEXT: ds_max_rtn_u64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB29_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] +; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc +; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: umax_i64_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .p2align 6 +; GFX1132_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s8, s[0:1], s[6:7] +; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_max_rtn_u64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB29_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: umax_i64_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_max_rtn_u64 v[0:1], v1, v[0:1] +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: umax_i64_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB29_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_max_rtn_u64 v[9:10], v8, v[9:10] +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB29_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8] +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5 +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: umax_i64_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB29_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX9_DPP-NEXT: ds_max_rtn_u64 v[9:10], v8, v[9:10] +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB29_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8] +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5 +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: umax_i64_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB29_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1064_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB29_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[9:10] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: umax_i64_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB29_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1032_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12] +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB29_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[9:10] +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc_lo +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: umax_i64_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[3:4] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s4, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s6, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB29_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1164_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB29_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[9:10] +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: umax_i64_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[5:6], v[3:4] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB29_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 +; GFX1132_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12] +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB29_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[9:10] +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc_lo +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc_lo +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %lane_ext = zext i32 %lane to i64 + %old = atomicrmw umax ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel + store i64 %old, ptr addrspace(1) %out + ret void +} + define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: umin_i32_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_min_u32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB30_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB30_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS_ITERATIVE-NEXT: v_min_u32_e32 v0, s4, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm ; +; GFX8_ITERATIVE-LABEL: umin_i32_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8_ITERATIVE-NEXT: s_min_u32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB30_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_min_u32_e32 v0, s4, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm ; -; GFX7LESS-LABEL: umin_i32_varying: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_min_rtn_u32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm +; GFX9_ITERATIVE-LABEL: umin_i32_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9_ITERATIVE-NEXT: s_min_u32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB30_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_min_u32_e32 v0, s4, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm ; -; GFX8-LABEL: umin_i32_varying: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s4, -1 -; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: .LBB23_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8-NEXT: s_min_u32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB23_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB23_4 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v2 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB23_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_min_u32_e32 v0, s4, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX8-NEXT: s_endpgm +; GFX1064_ITERATIVE-LABEL: umin_i32_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_min_u32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB30_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm ; -; GFX9-LABEL: umin_i32_varying: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s4, -1 -; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: .LBB23_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s5 -; GFX9-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9-NEXT: s_min_u32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB23_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB23_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB23_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_min_u32_e32 v0, s4, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX1032_ITERATIVE-LABEL: umin_i32_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_min_u32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB30_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm ; -; GFX1064-LABEL: umin_i32_varying: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s4, -1 -; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: .LBB23_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064-NEXT: s_min_u32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_cbranch_scc1 .LBB23_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB23_4 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v2 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB23_4: -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_min_u32_e32 v0, s2, v1 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm +; GFX1164_ITERATIVE-LABEL: umin_i32_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: s_min_u32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: ds_min_rtn_u32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB30_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm ; -; GFX1032-LABEL: umin_i32_varying: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_mov_b32 s0, -1 -; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: .LBB23_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_min_u32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_cbranch_scc1 .LBB23_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032-NEXT: s_cbranch_execz .LBB23_4 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 -; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v2 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB23_4: -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_min_u32_e32 v0, s2, v1 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm +; GFX1132_ITERATIVE-LABEL: umin_i32_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: s_min_u32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_min_rtn_u32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB30_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm ; -; GFX1164-LABEL: umin_i32_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b32 s4, -1 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: .LBB23_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-NEXT: s_min_u32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_cbranch_scc1 .LBB23_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB23_4 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_min_rtn_u32 v1, v1, v2 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB23_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_u32_e32 v0, s2, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm +; GFX7LESS_DPP-LABEL: umin_i32_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_min_rtn_u32 v0, v1, v0 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm ; -; GFX1132-LABEL: umin_i32_varying: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, -1 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: .LBB23_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_min_u32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132-NEXT: s_cbranch_scc1 .LBB23_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132-NEXT: s_cbranch_execz .LBB23_4 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132-NEXT: ds_min_rtn_u32 v1, v1, v2 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB23_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_u32_e32 v0, s2, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm +; GFX8_DPP-LABEL: umin_i32_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB30_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_min_rtn_u32 v0, v0, v3 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB30_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: umin_i32_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB30_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_DPP-NEXT: ds_min_rtn_u32 v0, v0, v3 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB30_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: umin_i32_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB30_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB30_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_min_u32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: umin_i32_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB30_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB30_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_min_u32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: umin_i32_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB30_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB30_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_u32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: umin_i32_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB30_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB30_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_min_u32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = atomicrmw umin ptr addrspace(3) @local_var32, i32 %lane acq_rel @@ -5763,8 +17427,6 @@ entry: } define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { -; -; ; GFX7LESS-LABEL: umin_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -5772,7 +17434,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB24_2 +; GFX7LESS-NEXT: s_cbranch_execz .LBB31_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -5780,7 +17442,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB24_2: +; GFX7LESS-NEXT: .LBB31_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -5805,7 +17467,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB24_2 +; GFX8-NEXT: s_cbranch_execz .LBB31_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -5813,7 +17475,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB24_2: +; GFX8-NEXT: .LBB31_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 @@ -5838,14 +17500,14 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB31_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB24_2: +; GFX9-NEXT: .LBB31_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 @@ -5870,7 +17532,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB24_2 +; GFX1064-NEXT: s_cbranch_execz .LBB31_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -5878,7 +17540,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB24_2: +; GFX1064-NEXT: .LBB31_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 @@ -5902,7 +17564,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB24_2 +; GFX1032-NEXT: s_cbranch_execz .LBB31_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -5910,7 +17572,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB24_2: +; GFX1032-NEXT: .LBB31_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 @@ -5936,7 +17598,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB24_2 +; GFX1164-NEXT: s_cbranch_execz .LBB31_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -5944,7 +17606,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB24_2: +; GFX1164-NEXT: .LBB31_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 @@ -5970,14 +17632,14 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB24_2 +; GFX1132-NEXT: s_cbranch_execz .LBB31_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX1132-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB24_2: +; GFX1132-NEXT: .LBB31_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 @@ -6000,5 +17662,1046 @@ entry: store i64 %old, ptr addrspace(1) %out ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX11: {{.*}} + +define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: umin_i64_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB32_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB32_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[1:2] +; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm +; +; GFX8_ITERATIVE-LABEL: umin_i64_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX8_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB32_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 +; GFX8_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[1:2] +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 +; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm +; +; GFX9_ITERATIVE-LABEL: umin_i64_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX9_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB32_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 +; GFX9_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[1:2] +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 +; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm +; +; GFX1064_ITERATIVE-LABEL: umin_i64_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 +; GFX1064_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s[8:9], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB32_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1064_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[1:2] +; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc +; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm +; +; GFX1032_ITERATIVE-LABEL: umin_i64_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[6:7] +; GFX1032_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB32_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1032_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[1:2] +; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo +; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: umin_i64_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .p2align 6 +; GFX1164_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s[8:9], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1164_ITERATIVE-NEXT: ds_min_rtn_u64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB32_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] +; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc +; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: umin_i64_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .p2align 6 +; GFX1132_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[6:7] +; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_min_rtn_u64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB32_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: umin_i64_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_min_rtn_u64 v[0:1], v1, v[0:1] +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: umin_i64_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB32_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_min_rtn_u64 v[9:10], v8, v[9:10] +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB32_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8] +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5 +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: umin_i64_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB32_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX9_DPP-NEXT: ds_min_rtn_u64 v[9:10], v8, v[9:10] +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB32_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8] +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5 +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: umin_i64_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB32_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1064_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB32_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[9:10] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: umin_i64_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB32_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1032_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12] +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB32_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[9:10] +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc_lo +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: umin_i64_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[5:6], v[3:4] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s4, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s6, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB32_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1164_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB32_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[9:10] +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: umin_i64_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, -1 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[5:6], v[3:4] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB32_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 +; GFX1132_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12] +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB32_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[9:10] +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc_lo +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc_lo +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %lane_ext = zext i32 %lane to i64 + %old = atomicrmw umin ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel + store i64 %old, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index 995d3fee67291..f636fa5d83a57 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -573,13 +573,44 @@ entry: define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: .LBB2_1: ; %ComputeLoop +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_mov_b32 m0, s5 +; GFX6-NEXT: v_readlane_b32 s8, v0, s5 +; GFX6-NEXT: v_writelane_b32 v1, s4, m0 +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX6-NEXT: s_add_i32 s4, s4, s8 +; GFX6-NEXT: s_cbranch_vccnz .LBB2_1 +; GFX6-NEXT: ; %bb.2: ; %ComputeEnd +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: ; implicit-def: $vgpr0 +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX6-NEXT: s_cbranch_execz .LBB2_4 +; GFX6-NEXT: ; %bb.3: +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX6-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX6-NEXT: .LBB2_4: +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -1625,13 +1656,44 @@ entry: define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: .LBB6_1: ; %ComputeLoop +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_mov_b32 m0, s5 +; GFX6-NEXT: v_readlane_b32 s8, v0, s5 +; GFX6-NEXT: v_writelane_b32 v1, s4, m0 +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX6-NEXT: s_add_i32 s4, s4, s8 +; GFX6-NEXT: s_cbranch_vccnz .LBB6_1 +; GFX6-NEXT: ; %bb.2: ; %ComputeEnd +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: ; implicit-def: $vgpr0 +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX6-NEXT: s_cbranch_execz .LBB6_4 +; GFX6-NEXT: ; %bb.3: +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX6-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX6-NEXT: .LBB6_4: +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index 720e2ef108076..3e8565d34c6be 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -589,14 +589,45 @@ entry: define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: v_mov_b32_e32 v1, 0 +; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: .LBB2_1: ; %ComputeLoop +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_mov_b32 m0, s5 +; GFX6-NEXT: v_readlane_b32 s8, v0, s5 +; GFX6-NEXT: v_writelane_b32 v1, s4, m0 +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX6-NEXT: s_add_i32 s4, s4, s8 +; GFX6-NEXT: s_cbranch_vccnz .LBB2_1 +; GFX6-NEXT: ; %bb.2: ; %ComputeEnd +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: ; implicit-def: $vgpr0 +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX6-NEXT: s_cbranch_execz .LBB2_4 +; GFX6-NEXT: ; %bb.3: +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 idxen glc +; GFX6-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX6-NEXT: .LBB2_4: +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -1798,14 +1829,45 @@ entry: define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: v_mov_b32_e32 v1, 0 +; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: .LBB7_1: ; %ComputeLoop +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_mov_b32 m0, s5 +; GFX6-NEXT: v_readlane_b32 s8, v0, s5 +; GFX6-NEXT: v_writelane_b32 v1, s4, m0 +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX6-NEXT: s_add_i32 s4, s4, s8 +; GFX6-NEXT: s_cbranch_vccnz .LBB7_1 +; GFX6-NEXT: ; %bb.2: ; %ComputeEnd +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: ; implicit-def: $vgpr0 +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX6-NEXT: s_cbranch_execz .LBB7_4 +; GFX6-NEXT: ; %bb.3: +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_sub v0, v1, s[4:7], 0 idxen glc +; GFX6-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc +; GFX6-NEXT: .LBB7_4: +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll index 9d8b987d2ba68..997ba4053bb29 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll @@ -1,255 +1,1251 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefixes=GFX90A,GFX90A_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefixes=GFX90A,GFX90A_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefixes=GFX940,GFX940_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefixes=GFX940,GFX940_DPP %s define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void } define amdgpu_ps double @global_atomic_fadd_f64_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX90A-NEXT: $sgpr0 = COPY [[COPY6]] + ; GFX90A-NEXT: $sgpr1 = COPY [[COPY7]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX940-NEXT: $sgpr0 = COPY [[COPY6]] + ; GFX940-NEXT: $sgpr1 = COPY [[COPY7]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void } define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY5]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; GFX90A-NEXT: $sgpr0 = COPY [[COPY5]] + ; GFX90A-NEXT: $sgpr1 = COPY [[COPY6]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; GFX940-NEXT: $sgpr0 = COPY [[COPY5]] + ; GFX940-NEXT: $sgpr1 = COPY [[COPY6]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void } define amdgpu_ps double @global_atomic_fadd_f64_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX90A-NEXT: $sgpr0 = COPY [[COPY6]] + ; GFX90A-NEXT: $sgpr1 = COPY [[COPY7]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX940-NEXT: $sgpr0 = COPY [[COPY6]] + ; GFX940-NEXT: $sgpr1 = COPY [[COPY7]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void } define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY5]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; GFX90A-NEXT: $sgpr0 = COPY [[COPY5]] + ; GFX90A-NEXT: $sgpr1 = COPY [[COPY6]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; GFX940-NEXT: $sgpr0 = COPY [[COPY5]] + ; GFX940-NEXT: $sgpr1 = COPY [[COPY6]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret void } define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX90A-NEXT: $sgpr0 = COPY [[COPY6]] + ; GFX90A-NEXT: $sgpr1 = COPY [[COPY7]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX940-NEXT: $sgpr0 = COPY [[COPY6]] + ; GFX940-NEXT: $sgpr1 = COPY [[COPY7]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw + ; GFX90A_ITERATIVE: bb.0 (%ir-block.0): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000) + ; GFX90A_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX90A_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX90A_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.1 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.1 (%ir-block.5): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.6(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX90A_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[COPY6]] + ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.6 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.2 (%ir-block.7): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.3(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY %42 + ; GFX90A_ITERATIVE-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], [[COPY8]], [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.3.Flow: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.5(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: SI_END_CF %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.5 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.4 (%ir-block.9): + ; GFX90A_ITERATIVE-NEXT: S_ENDPGM 0 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.5.Flow1: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.4 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.6.ComputeLoop: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.7(0x04000000), %bb.6(0x7c000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[V_MOV_B]], %bb.1, %26, %bb.6 + ; GFX90A_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:sreg_64 = PHI [[COPY7]], %bb.1, %5, %bb.6 + ; GFX90A_ITERATIVE-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sreg_32 = S_FF1_I32_B64 [[PHI1]] + ; GFX90A_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY9]], [[S_FF1_I32_B64_]] + ; GFX90A_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY10]], [[S_FF1_I32_B64_]] + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY killed [[REG_SEQUENCE2]] + ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI]], 0, [[COPY11]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1 + ; GFX90A_ITERATIVE-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 killed [[S_MOV_B64_]], [[S_FF1_I32_B64_]], implicit-def dead $scc + ; GFX90A_ITERATIVE-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[PHI1]], killed [[S_LSHL_B64_]], implicit-def dead $scc + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX90A_ITERATIVE-NEXT: S_CMP_LG_U64 [[S_ANDN2_B64_]], killed [[S_MOV_B64_1]], implicit-def $scc + ; GFX90A_ITERATIVE-NEXT: S_CBRANCH_SCC1 %bb.6, implicit $scc + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.7 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.7.ComputeEnd: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_]], %bb.6 + ; GFX90A_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY13]], [[COPY14]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY12]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.2 + ; + ; GFX90A_DPP-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw + ; GFX90A_DPP: bb.0 (%ir-block.0): + ; GFX90A_DPP-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) + ; GFX90A_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_DPP-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_DPP-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_DPP-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_DPP-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_DPP-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX90A_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX90A_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.1 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.1 (%ir-block.5): + ; GFX90A_DPP-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub1 + ; GFX90A_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub0 + ; GFX90A_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX90A_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY8]], [[COPY9]], implicit $exec + ; GFX90A_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY7]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec + ; GFX90A_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[COPY4]], [[V_MOV_B]], implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, killed [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, killed [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, killed [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, killed [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, killed [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, killed [[V_MOV_B6]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1 + ; GFX90A_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX90A_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY10]], [[S_MOV_B32_1]] + ; GFX90A_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0 + ; GFX90A_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY11]], [[S_MOV_B32_1]] + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1 + ; GFX90A_DPP-NEXT: early-clobber %1:sreg_64 = STRICT_WWM killed [[REG_SEQUENCE2]], implicit $exec + ; GFX90A_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX90A_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.2 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.2 (%ir-block.31): + ; GFX90A_DPP-NEXT: successors: %bb.3(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY %1 + ; GFX90A_DPP-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], [[COPY12]], [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.3.Flow: + ; GFX90A_DPP-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.4 (%ir-block.33): + ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_ENDPGM 0 + ; + ; GFX940_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw + ; GFX940_ITERATIVE: bb.0 (%ir-block.0): + ; GFX940_ITERATIVE-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000) + ; GFX940_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX940_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX940_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX940_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX940_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.1 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.1 (%ir-block.5): + ; GFX940_ITERATIVE-NEXT: successors: %bb.6(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX940_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX940_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[COPY6]] + ; GFX940_ITERATIVE-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.6 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.2 (%ir-block.7): + ; GFX940_ITERATIVE-NEXT: successors: %bb.3(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY %41 + ; GFX940_ITERATIVE-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], [[COPY8]], [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.3.Flow: + ; GFX940_ITERATIVE-NEXT: successors: %bb.5(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: SI_END_CF %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.5 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.4 (%ir-block.9): + ; GFX940_ITERATIVE-NEXT: S_ENDPGM 0 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.5.Flow1: + ; GFX940_ITERATIVE-NEXT: successors: %bb.4(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.4 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.6.ComputeLoop: + ; GFX940_ITERATIVE-NEXT: successors: %bb.7(0x04000000), %bb.6(0x7c000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[V_MOV_B]], %bb.1, %25, %bb.6 + ; GFX940_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:sreg_64 = PHI [[COPY7]], %bb.1, %5, %bb.6 + ; GFX940_ITERATIVE-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sreg_32 = S_FF1_I32_B64 [[PHI1]] + ; GFX940_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY9]], [[S_FF1_I32_B64_]] + ; GFX940_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub0 + ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY10]], [[S_FF1_I32_B64_]] + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY killed [[REG_SEQUENCE2]] + ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI]], 0, [[COPY11]], 0, 0, implicit $mode, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1 + ; GFX940_ITERATIVE-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 killed [[S_MOV_B64_]], [[S_FF1_I32_B64_]], implicit-def dead $scc + ; GFX940_ITERATIVE-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[PHI1]], killed [[S_LSHL_B64_]], implicit-def dead $scc + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX940_ITERATIVE-NEXT: S_CMP_LG_U64 [[S_ANDN2_B64_]], killed [[S_MOV_B64_1]], implicit-def $scc + ; GFX940_ITERATIVE-NEXT: S_CBRANCH_SCC1 %bb.6, implicit $scc + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.7 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.7.ComputeEnd: + ; GFX940_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_]], %bb.6 + ; GFX940_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub0 + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY13]], [[COPY14]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY12]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.2 + ; + ; GFX940_DPP-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw + ; GFX940_DPP: bb.0 (%ir-block.0): + ; GFX940_DPP-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) + ; GFX940_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940_DPP-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940_DPP-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX940_DPP-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX940_DPP-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_DPP-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX940_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX940_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.1 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.1 (%ir-block.5): + ; GFX940_DPP-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX940_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub1 + ; GFX940_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub0 + ; GFX940_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX940_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY8]], [[COPY9]], implicit $exec + ; GFX940_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY7]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec + ; GFX940_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[COPY4]], [[V_MOV_B]], implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, killed [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, killed [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, killed [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, killed [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, killed [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, killed [[V_MOV_B6]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1 + ; GFX940_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX940_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY10]], [[S_MOV_B32_1]] + ; GFX940_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0 + ; GFX940_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY11]], [[S_MOV_B32_1]] + ; GFX940_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1 + ; GFX940_DPP-NEXT: early-clobber %1:sreg_64 = STRICT_WWM killed [[REG_SEQUENCE2]], implicit $exec + ; GFX940_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX940_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.2 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.2 (%ir-block.31): + ; GFX940_DPP-NEXT: successors: %bb.3(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY %1 + ; GFX940_DPP-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], [[COPY12]], [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.3.Flow: + ; GFX940_DPP-NEXT: successors: %bb.4(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.4 (%ir-block.33): + ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret void } define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY5]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw + ; GFX90A_ITERATIVE: bb.0 (%ir-block.0): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000) + ; GFX90A_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX90A_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX90A_ITERATIVE-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[DEF2]] + ; GFX90A_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.1 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.1 (%ir-block.5): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.6(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX90A_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[COPY7]] + ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[DEF3]] + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.6 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.2 (%ir-block.7): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.3(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY %81 + ; GFX90A_ITERATIVE-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY10]], [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.3 (%ir-block.9): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.5(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %91, %bb.7, [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.2 + ; GFX90A_ITERATIVE-NEXT: SI_END_CF %15, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_1]], %subreg.sub0, [[V_READFIRSTLANE_B32_]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, killed [[REG_SEQUENCE2]], 0, %12, 0, 0, implicit $mode, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:sreg_64_xexec = COPY %14 + ; GFX90A_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A_ITERATIVE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[COPY13]], 0, [[COPY15]], [[COPY14]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:sreg_64_xexec = COPY %14 + ; GFX90A_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A_ITERATIVE-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[COPY16]], 0, [[COPY18]], [[COPY17]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_CNDMASK_B32_e64_1]], %subreg.sub0, [[V_CNDMASK_B32_e64_]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.5 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.4 (%ir-block.14): + ; GFX90A_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY %5.sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY %5.sub1 + ; GFX90A_ITERATIVE-NEXT: $sgpr0 = COPY [[COPY20]] + ; GFX90A_ITERATIVE-NEXT: $sgpr1 = COPY [[COPY21]] + ; GFX90A_ITERATIVE-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.5.Flow: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[COPY6]], %bb.0, [[COPY19]], %bb.3 + ; GFX90A_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.4 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.6.ComputeLoop: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.7(0x04000000), %bb.6(0x7c000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_MOV_B]], %bb.1, %43, %bb.6 + ; GFX90A_ITERATIVE-NEXT: [[PHI3:%[0-9]+]]:vreg_64_align2 = PHI [[COPY9]], %bb.1, %9, %bb.6 + ; GFX90A_ITERATIVE-NEXT: [[PHI4:%[0-9]+]]:sreg_64 = PHI [[COPY8]], %bb.1, %11, %bb.6 + ; GFX90A_ITERATIVE-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sreg_32 = S_FF1_I32_B64 [[PHI4]] + ; GFX90A_ITERATIVE-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY22]], [[S_FF1_I32_B64_]] + ; GFX90A_ITERATIVE-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY23]], [[S_FF1_I32_B64_]] + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1 + ; GFX90A_ITERATIVE-NEXT: $m0 = COPY [[S_FF1_I32_B64_]] + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY25]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_2]], $m0, [[COPY24]] + ; GFX90A_ITERATIVE-NEXT: [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0 + ; GFX90A_ITERATIVE-NEXT: $m0 = COPY [[S_FF1_I32_B64_]] + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY27]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_3]], $m0, [[COPY26]] + ; GFX90A_ITERATIVE-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_WRITELANE_B32_1]], %subreg.sub0, [[V_WRITELANE_B32_]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY28:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE5]] + ; GFX90A_ITERATIVE-NEXT: [[COPY29:%[0-9]+]]:sreg_64 = COPY killed [[REG_SEQUENCE4]] + ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI2]], 0, [[COPY29]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1 + ; GFX90A_ITERATIVE-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 killed [[S_MOV_B64_]], [[S_FF1_I32_B64_]], implicit-def dead $scc + ; GFX90A_ITERATIVE-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[PHI4]], killed [[S_LSHL_B64_]], implicit-def dead $scc + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX90A_ITERATIVE-NEXT: S_CMP_LG_U64 [[S_ANDN2_B64_]], killed [[S_MOV_B64_1]], implicit-def $scc + ; GFX90A_ITERATIVE-NEXT: S_CBRANCH_SCC1 %bb.6, implicit $scc + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.7 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.7.ComputeEnd: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI5:%[0-9]+]]:vreg_64_align2 = PHI [[COPY28]], %bb.6 + ; GFX90A_ITERATIVE-NEXT: [[PHI6:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_1]], %bb.6 + ; GFX90A_ITERATIVE-NEXT: [[COPY30:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY31:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_ITERATIVE-NEXT: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY31]], [[COPY32]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY30]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY33:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] + ; GFX90A_ITERATIVE-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[COPY34:%[0-9]+]]:vreg_64_align2 = COPY [[DEF9]] + ; GFX90A_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.2 + ; + ; GFX90A_DPP-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw + ; GFX90A_DPP: bb.0 (%ir-block.0): + ; GFX90A_DPP-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; GFX90A_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_DPP-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_DPP-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_DPP-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_DPP-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_DPP-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX90A_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX90A_DPP-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_DPP-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[DEF2]] + ; GFX90A_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.1 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.1 (%ir-block.5): + ; GFX90A_DPP-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub1 + ; GFX90A_DPP-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub0 + ; GFX90A_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX90A_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY9]], [[COPY10]], implicit $exec + ; GFX90A_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY8]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec + ; GFX90A_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[COPY4]], [[V_MOV_B]], implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, killed [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, killed [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, killed [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, killed [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, killed [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, killed [[V_MOV_B6]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_5]], 312, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1 + ; GFX90A_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX90A_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY11]], [[S_MOV_B32_1]] + ; GFX90A_DPP-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0 + ; GFX90A_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY12]], [[S_MOV_B32_1]] + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1 + ; GFX90A_DPP-NEXT: early-clobber %2:sreg_64 = STRICT_WWM killed [[REG_SEQUENCE2]], implicit $exec + ; GFX90A_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX90A_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] + ; GFX90A_DPP-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[DEF3]] + ; GFX90A_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.2 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.2 (%ir-block.32): + ; GFX90A_DPP-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY %2 + ; GFX90A_DPP-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY15]], [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_DPP-NEXT: S_BRANCH %bb.4 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.3.Flow: + ; GFX90A_DPP-NEXT: successors: %bb.5(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[COPY6]], %bb.0, %8, %bb.4 + ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.5 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.4 (%ir-block.35): + ; GFX90A_DPP-NEXT: successors: %bb.3(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[COPY14]], %bb.1, [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.2 + ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY16]], implicit $exec + ; GFX90A_DPP-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY17]], implicit $exec + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_1]], %subreg.sub0, [[V_READFIRSTLANE_B32_]], %subreg.sub1 + ; GFX90A_DPP-NEXT: early-clobber %56:vreg_64_align2 = STRICT_WWM [[V_MOV_B7]], implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_6:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, killed [[REG_SEQUENCE3]], 0, killed %56, 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_6]].sub1 + ; GFX90A_DPP-NEXT: [[COPY19:%[0-9]+]]:sreg_64_xexec = COPY [[COPY13]] + ; GFX90A_DPP-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A_DPP-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[COPY18]], 0, [[COPY20]], [[COPY19]], implicit $exec + ; GFX90A_DPP-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_6]].sub0 + ; GFX90A_DPP-NEXT: [[COPY22:%[0-9]+]]:sreg_64_xexec = COPY [[COPY13]] + ; GFX90A_DPP-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A_DPP-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[COPY21]], 0, [[COPY23]], [[COPY22]], implicit $exec + ; GFX90A_DPP-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_DPP-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_CNDMASK_B32_e64_1]], %subreg.sub0, [[V_CNDMASK_B32_e64_]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[COPY24:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE4]] + ; GFX90A_DPP-NEXT: S_BRANCH %bb.3 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.5 (%ir-block.41): + ; GFX90A_DPP-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GFX90A_DPP-NEXT: [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GFX90A_DPP-NEXT: $sgpr0 = COPY [[COPY25]] + ; GFX90A_DPP-NEXT: $sgpr1 = COPY [[COPY26]] + ; GFX90A_DPP-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; + ; GFX940_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw + ; GFX940_ITERATIVE: bb.0 (%ir-block.0): + ; GFX940_ITERATIVE-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000) + ; GFX940_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX940_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX940_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX940_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX940_ITERATIVE-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[DEF2]] + ; GFX940_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.1 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.1 (%ir-block.5): + ; GFX940_ITERATIVE-NEXT: successors: %bb.6(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX940_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX940_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[COPY7]] + ; GFX940_ITERATIVE-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[DEF3]] + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.6 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.2 (%ir-block.7): + ; GFX940_ITERATIVE-NEXT: successors: %bb.3(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY %80 + ; GFX940_ITERATIVE-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY10]], [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.3 (%ir-block.9): + ; GFX940_ITERATIVE-NEXT: successors: %bb.5(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %90, %bb.7, [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.2 + ; GFX940_ITERATIVE-NEXT: SI_END_CF %15, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_1]], %subreg.sub0, [[V_READFIRSTLANE_B32_]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, killed [[REG_SEQUENCE2]], 0, %12, 0, 0, implicit $mode, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:sreg_64_xexec = COPY %14 + ; GFX940_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX940_ITERATIVE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[COPY13]], 0, [[COPY15]], [[COPY14]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:sreg_64_xexec = COPY %14 + ; GFX940_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX940_ITERATIVE-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[COPY16]], 0, [[COPY18]], [[COPY17]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_CNDMASK_B32_e64_1]], %subreg.sub0, [[V_CNDMASK_B32_e64_]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.5 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.4 (%ir-block.14): + ; GFX940_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY %5.sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY %5.sub1 + ; GFX940_ITERATIVE-NEXT: $sgpr0 = COPY [[COPY20]] + ; GFX940_ITERATIVE-NEXT: $sgpr1 = COPY [[COPY21]] + ; GFX940_ITERATIVE-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.5.Flow: + ; GFX940_ITERATIVE-NEXT: successors: %bb.4(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[COPY6]], %bb.0, [[COPY19]], %bb.3 + ; GFX940_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.4 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.6.ComputeLoop: + ; GFX940_ITERATIVE-NEXT: successors: %bb.7(0x04000000), %bb.6(0x7c000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_MOV_B]], %bb.1, %42, %bb.6 + ; GFX940_ITERATIVE-NEXT: [[PHI3:%[0-9]+]]:vreg_64_align2 = PHI [[COPY9]], %bb.1, %9, %bb.6 + ; GFX940_ITERATIVE-NEXT: [[PHI4:%[0-9]+]]:sreg_64 = PHI [[COPY8]], %bb.1, %11, %bb.6 + ; GFX940_ITERATIVE-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sreg_32 = S_FF1_I32_B64 [[PHI4]] + ; GFX940_ITERATIVE-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY22]], [[S_FF1_I32_B64_]] + ; GFX940_ITERATIVE-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub0 + ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY23]], [[S_FF1_I32_B64_]] + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1 + ; GFX940_ITERATIVE-NEXT: $m0 = COPY [[S_FF1_I32_B64_]] + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY25]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_2]], $m0, [[COPY24]] + ; GFX940_ITERATIVE-NEXT: [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0 + ; GFX940_ITERATIVE-NEXT: $m0 = COPY [[S_FF1_I32_B64_]] + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY27]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_3]], $m0, [[COPY26]] + ; GFX940_ITERATIVE-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_WRITELANE_B32_1]], %subreg.sub0, [[V_WRITELANE_B32_]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY28:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE5]] + ; GFX940_ITERATIVE-NEXT: [[COPY29:%[0-9]+]]:sreg_64 = COPY killed [[REG_SEQUENCE4]] + ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI2]], 0, [[COPY29]], 0, 0, implicit $mode, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1 + ; GFX940_ITERATIVE-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 killed [[S_MOV_B64_]], [[S_FF1_I32_B64_]], implicit-def dead $scc + ; GFX940_ITERATIVE-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[PHI4]], killed [[S_LSHL_B64_]], implicit-def dead $scc + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX940_ITERATIVE-NEXT: S_CMP_LG_U64 [[S_ANDN2_B64_]], killed [[S_MOV_B64_1]], implicit-def $scc + ; GFX940_ITERATIVE-NEXT: S_CBRANCH_SCC1 %bb.6, implicit $scc + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.7 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.7.ComputeEnd: + ; GFX940_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI5:%[0-9]+]]:vreg_64_align2 = PHI [[COPY28]], %bb.6 + ; GFX940_ITERATIVE-NEXT: [[PHI6:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_1]], %bb.6 + ; GFX940_ITERATIVE-NEXT: [[COPY30:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY31:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub0 + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_ITERATIVE-NEXT: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY31]], [[COPY32]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY30]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY33:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] + ; GFX940_ITERATIVE-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[COPY34:%[0-9]+]]:vreg_64_align2 = COPY [[DEF9]] + ; GFX940_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.2 + ; + ; GFX940_DPP-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw + ; GFX940_DPP: bb.0 (%ir-block.0): + ; GFX940_DPP-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; GFX940_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940_DPP-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940_DPP-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX940_DPP-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX940_DPP-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_DPP-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX940_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX940_DPP-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_DPP-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[DEF2]] + ; GFX940_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.1 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.1 (%ir-block.5): + ; GFX940_DPP-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX940_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub1 + ; GFX940_DPP-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub0 + ; GFX940_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX940_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY9]], [[COPY10]], implicit $exec + ; GFX940_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY8]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec + ; GFX940_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[COPY4]], [[V_MOV_B]], implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, killed [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, killed [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, killed [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, killed [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, killed [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, killed [[V_MOV_B6]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_5]], 312, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1 + ; GFX940_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX940_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY11]], [[S_MOV_B32_1]] + ; GFX940_DPP-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0 + ; GFX940_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY12]], [[S_MOV_B32_1]] + ; GFX940_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1 + ; GFX940_DPP-NEXT: early-clobber %2:sreg_64 = STRICT_WWM killed [[REG_SEQUENCE2]], implicit $exec + ; GFX940_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX940_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] + ; GFX940_DPP-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[DEF3]] + ; GFX940_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.2 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.2 (%ir-block.32): + ; GFX940_DPP-NEXT: successors: %bb.4(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940_DPP-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY %2 + ; GFX940_DPP-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY15]], [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940_DPP-NEXT: S_BRANCH %bb.4 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.3.Flow: + ; GFX940_DPP-NEXT: successors: %bb.5(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[COPY6]], %bb.0, %8, %bb.4 + ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.5 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.4 (%ir-block.35): + ; GFX940_DPP-NEXT: successors: %bb.3(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[COPY14]], %bb.1, [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.2 + ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY16]], implicit $exec + ; GFX940_DPP-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY17]], implicit $exec + ; GFX940_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_1]], %subreg.sub0, [[V_READFIRSTLANE_B32_]], %subreg.sub1 + ; GFX940_DPP-NEXT: early-clobber %55:vreg_64_align2 = STRICT_WWM [[V_MOV_B7]], implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_6:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, killed [[REG_SEQUENCE3]], 0, killed %55, 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_6]].sub1 + ; GFX940_DPP-NEXT: [[COPY19:%[0-9]+]]:sreg_64_xexec = COPY [[COPY13]] + ; GFX940_DPP-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX940_DPP-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[COPY18]], 0, [[COPY20]], [[COPY19]], implicit $exec + ; GFX940_DPP-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_6]].sub0 + ; GFX940_DPP-NEXT: [[COPY22:%[0-9]+]]:sreg_64_xexec = COPY [[COPY13]] + ; GFX940_DPP-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX940_DPP-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[COPY21]], 0, [[COPY23]], [[COPY22]], implicit $exec + ; GFX940_DPP-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_DPP-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_DPP-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_CNDMASK_B32_e64_1]], %subreg.sub0, [[V_CNDMASK_B32_e64_]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[COPY24:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE4]] + ; GFX940_DPP-NEXT: S_BRANCH %bb.3 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.5 (%ir-block.41): + ; GFX940_DPP-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GFX940_DPP-NEXT: [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GFX940_DPP-NEXT: $sgpr0 = COPY [[COPY25]] + ; GFX940_DPP-NEXT: $sgpr1 = COPY [[COPY26]] + ; GFX940_DPP-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret double %ret } diff --git a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll index dab5e991d7d43..345b1b601d6a8 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll @@ -1013,9 +1013,89 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_agent_s } define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 { -; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-NEXT: ret double [[RESULT]] +; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP23:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = fadd double [[TMP14]], [[TMP22:%.*]] +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = select i1 [[TMP28:%.*]], double [[TMP14]], double [[TMP15]] +; IR-ITERATIVE-NEXT: br label [[TMP17]] +; IR-ITERATIVE: 17: +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP16]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP18]] +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP23]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP26:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP20]]) +; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP20]], double [[OLDVALUEPHI]]) +; IR-ITERATIVE-NEXT: [[TMP23]] = fadd double [[ACCUMULATOR]], [[TMP21]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = shl i64 1, [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = xor i64 [[TMP24]], -1 +; IR-ITERATIVE-NEXT: [[TMP26]] = and i64 [[ACTIVEBITS]], [[TMP25]] +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i64 [[TMP26]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP28]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP28]], label [[TMP10]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP34:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP11:%.*]] = fadd double [[TMP9]], [[TMP10]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = fadd double [[TMP11]], [[TMP12]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP15:%.*]] = fadd double [[TMP13]], [[TMP14]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP17:%.*]] = fadd double [[TMP15]], [[TMP16]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP19:%.*]] = fadd double [[TMP17]], [[TMP18]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP21:%.*]] = fadd double [[TMP19]], [[TMP20]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP21]], i32 312, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) +; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]]) +; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]] +; IR-DPP: 26: +; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ] +; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]]) +; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) +; IR-DPP-NEXT: [[TMP32:%.*]] = fadd double [[TMP30]], [[TMP31]] +; IR-DPP-NEXT: [[TMP33:%.*]] = select i1 [[TMP25]], double [[TMP30]], double [[TMP32]] +; IR-DPP-NEXT: br label [[TMP34]] +; IR-DPP: 34: +; IR-DPP-NEXT: [[TMP35:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP33]], [[TMP28]] ] +; IR-DPP-NEXT: ret double [[TMP35]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4 ret double %result @@ -1089,9 +1169,89 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_one_as_ } define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double %val) #1 { -; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8 -; IR-NEXT: ret double [[RESULT]] +; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP23:%.*]] syncscope("one-as") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP14]], double [[TMP22:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = select i1 [[TMP28:%.*]], double [[TMP14]], double [[TMP15]] +; IR-ITERATIVE-NEXT: br label [[TMP17]] +; IR-ITERATIVE: 17: +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP16]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP18]] +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP23]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP26:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP20]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP20]], double [[OLDVALUEPHI]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = shl i64 1, [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = xor i64 [[TMP24]], -1 +; IR-ITERATIVE-NEXT: [[TMP26]] = and i64 [[ACTIVEBITS]], [[TMP25]] +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i64 [[TMP26]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP28]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP28]], label [[TMP10]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP34:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]] +; IR-DPP: 26: +; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] syncscope("one-as") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ] +; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP32:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP30]], double [[TMP31]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP33:%.*]] = select i1 [[TMP25]], double [[TMP30]], double [[TMP32]] +; IR-DPP-NEXT: br label [[TMP34]] +; IR-DPP: 34: +; IR-DPP-NEXT: [[TMP35:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP33]], [[TMP28]] ] +; IR-DPP-NEXT: ret double [[TMP35]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic ret double %result @@ -1165,9 +1325,89 @@ define amdgpu_ps double @global_atomic_fsub_double_uni_address_uni_value_agent_s } define amdgpu_ps double @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 { -; IR-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 -; IR-NEXT: ret double [[RESULT]] +; IR-ITERATIVE-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[TMP23:%.*]] syncscope("agent") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[TMP14]], double [[TMP22:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = select i1 [[TMP28:%.*]], double [[TMP14]], double [[TMP15]] +; IR-ITERATIVE-NEXT: br label [[TMP17]] +; IR-ITERATIVE: 17: +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP16]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP18]] +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP23]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP26:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP20]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP20]], double [[OLDVALUEPHI]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = shl i64 1, [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = xor i64 [[TMP24]], -1 +; IR-ITERATIVE-NEXT: [[TMP26]] = and i64 [[ACTIVEBITS]], [[TMP25]] +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i64 [[TMP26]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP28]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP28]], label [[TMP10]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP34:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]] +; IR-DPP: 26: +; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] syncscope("agent") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ] +; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP32:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[TMP30]], double [[TMP31]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP33:%.*]] = select i1 [[TMP25]], double [[TMP30]], double [[TMP32]] +; IR-DPP-NEXT: br label [[TMP34]] +; IR-DPP: 34: +; IR-DPP-NEXT: [[TMP35:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP33]], [[TMP28]] ] +; IR-DPP-NEXT: ret double [[TMP35]] ; %result = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic ret double %result @@ -1206,9 +1446,89 @@ define amdgpu_ps double @global_atomic_fmin_double_uni_address_uni_value_agent_s } define amdgpu_ps double @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 { -; IR-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 -; IR-NEXT: ret double [[RESULT]] +; IR-ITERATIVE-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[TMP23:%.*]] syncscope("agent") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.minnum.f64(double [[TMP14]], double [[TMP22:%.*]]) +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = select i1 [[TMP28:%.*]], double [[TMP14]], double [[TMP15]] +; IR-ITERATIVE-NEXT: br label [[TMP17]] +; IR-ITERATIVE: 17: +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP16]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP18]] +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ 0x7FF8000000000000, [[TMP2]] ], [ [[TMP23]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP26:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP20]]) +; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP20]], double [[OLDVALUEPHI]]) +; IR-ITERATIVE-NEXT: [[TMP23]] = call double @llvm.minnum.f64(double [[ACCUMULATOR]], double [[TMP21]]) +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = shl i64 1, [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = xor i64 [[TMP24]], -1 +; IR-ITERATIVE-NEXT: [[TMP26]] = and i64 [[ACTIVEBITS]], [[TMP25]] +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i64 [[TMP26]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP28]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP28]], label [[TMP10]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP34:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double 0x7FF8000000000000) +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.minnum.f64(double [[TMP9]], double [[TMP10]]) +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.minnum.f64(double [[TMP11]], double [[TMP12]]) +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.minnum.f64(double [[TMP13]], double [[TMP14]]) +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.minnum.f64(double [[TMP15]], double [[TMP16]]) +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.minnum.f64(double [[TMP17]], double [[TMP18]]) +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.minnum.f64(double [[TMP19]], double [[TMP20]]) +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP21]], i32 312, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) +; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]]) +; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]] +; IR-DPP: 26: +; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] syncscope("agent") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ] +; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]]) +; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) +; IR-DPP-NEXT: [[TMP32:%.*]] = call double @llvm.minnum.f64(double [[TMP30]], double [[TMP31]]) +; IR-DPP-NEXT: [[TMP33:%.*]] = select i1 [[TMP25]], double [[TMP30]], double [[TMP32]] +; IR-DPP-NEXT: br label [[TMP34]] +; IR-DPP: 34: +; IR-DPP-NEXT: [[TMP35:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP33]], [[TMP28]] ] +; IR-DPP-NEXT: ret double [[TMP35]] ; %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic ret double %result @@ -1274,9 +1594,89 @@ define amdgpu_ps double @global_atomic__fmax_double_uni_address_uni_value_agent_ } define amdgpu_ps double @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double %val) #1{ -; IR-LABEL: @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 -; IR-NEXT: ret double [[RESULT]] +; IR-ITERATIVE-LABEL: @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[TMP23:%.*]] syncscope("agent") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP14]], double [[TMP22:%.*]], metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = select i1 [[TMP28:%.*]], double [[TMP14]], double [[TMP15]] +; IR-ITERATIVE-NEXT: br label [[TMP17]] +; IR-ITERATIVE: 17: +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP16]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP18]] +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ 0x7FF8000000000000, [[TMP2]] ], [ [[TMP23]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP26:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP20]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP20]], double [[OLDVALUEPHI]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23]] = call double @llvm.experimental.constrained.maxnum.f64(double [[ACCUMULATOR]], double [[TMP21]], metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = shl i64 1, [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = xor i64 [[TMP24]], -1 +; IR-ITERATIVE-NEXT: [[TMP26]] = and i64 [[ACTIVEBITS]], [[TMP25]] +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i64 [[TMP26]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP28]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP28]], label [[TMP10]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP34:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double 0x7FF8000000000000) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP9]], double [[TMP10]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP11]], double [[TMP12]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP13]], double [[TMP14]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP15]], double [[TMP16]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP17]], double [[TMP18]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP19]], double [[TMP20]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]] +; IR-DPP: 26: +; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] syncscope("agent") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ] +; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP32:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP30]], double [[TMP31]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP33:%.*]] = select i1 [[TMP25]], double [[TMP30]], double [[TMP32]] +; IR-DPP-NEXT: br label [[TMP34]] +; IR-DPP: 34: +; IR-DPP-NEXT: [[TMP35:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP33]], [[TMP28]] ] +; IR-DPP-NEXT: ret double [[TMP35]] ; %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic ret double %result @@ -1350,9 +1750,89 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_system_ } define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 { -; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4 -; IR-NEXT: ret double [[RESULT]] +; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP23:%.*]] monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP14]], double [[TMP22:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = select i1 [[TMP28:%.*]], double [[TMP14]], double [[TMP15]] +; IR-ITERATIVE-NEXT: br label [[TMP17]] +; IR-ITERATIVE: 17: +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP16]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP18]] +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP23]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP26:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP20]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP20]], double [[OLDVALUEPHI]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = shl i64 1, [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = xor i64 [[TMP24]], -1 +; IR-ITERATIVE-NEXT: [[TMP26]] = and i64 [[ACTIVEBITS]], [[TMP25]] +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i64 [[TMP26]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP28]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP28]], label [[TMP10]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP34:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]] +; IR-DPP: 26: +; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ] +; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP32:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP30]], double [[TMP31]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP33:%.*]] = select i1 [[TMP25]], double [[TMP30]], double [[TMP32]] +; IR-DPP-NEXT: br label [[TMP34]] +; IR-DPP: 34: +; IR-DPP-NEXT: [[TMP35:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP33]], [[TMP28]] ] +; IR-DPP-NEXT: ret double [[TMP35]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4 ret double %result diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll index dfc831cb5050a..c89be8063d9a8 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll @@ -852,9 +852,75 @@ define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_agent_sco } define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 { -; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-NEXT: ret void +; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: br label [[TMP13]] +; IR-ITERATIVE: 13: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]]) +; IR-ITERATIVE-NEXT: [[TMP17]] = fadd double [[ACCUMULATOR]], [[TMP16]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1 +; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP11:%.*]] = fadd double [[TMP9]], [[TMP10]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = fadd double [[TMP11]], [[TMP12]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP15:%.*]] = fadd double [[TMP13]], [[TMP14]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP17:%.*]] = fadd double [[TMP15]], [[TMP16]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP19:%.*]] = fadd double [[TMP17]], [[TMP18]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP21:%.*]] = fadd double [[TMP19]], [[TMP20]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) +; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]] +; IR-DPP: 25: +; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP27]] +; IR-DPP: 27: +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4 ret void @@ -914,9 +980,75 @@ define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_one_as_sc } define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double %val) #1 { -; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8 -; IR-NEXT: ret void +; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] syncscope("one-as") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: br label [[TMP13]] +; IR-ITERATIVE: 13: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP17]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1 +; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]] +; IR-DPP: 25: +; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] syncscope("one-as") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP27]] +; IR-DPP: 27: +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic ret void @@ -976,9 +1108,75 @@ define amdgpu_ps void @global_atomic_fsub_double_uni_address_uni_value_agent_sco } define amdgpu_ps void @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 { -; IR-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 -; IR-NEXT: ret void +; IR-ITERATIVE-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] syncscope("agent") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: br label [[TMP13]] +; IR-ITERATIVE: 13: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP17]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1 +; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]] +; IR-DPP: 25: +; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] syncscope("agent") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP27]] +; IR-DPP: 27: +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: ret void ; %result = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic ret void @@ -1010,9 +1208,75 @@ define amdgpu_ps void @global_atomic_fmin_double_uni_address_uni_value_agent_sco } define amdgpu_ps void @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 { -; IR-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 -; IR-NEXT: ret void +; IR-ITERATIVE-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] syncscope("agent") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: br label [[TMP13]] +; IR-ITERATIVE: 13: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ 0x7FF8000000000000, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]]) +; IR-ITERATIVE-NEXT: [[TMP17]] = call double @llvm.minnum.f64(double [[ACCUMULATOR]], double [[TMP16]]) +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1 +; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double 0x7FF8000000000000) +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.minnum.f64(double [[TMP9]], double [[TMP10]]) +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.minnum.f64(double [[TMP11]], double [[TMP12]]) +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.minnum.f64(double [[TMP13]], double [[TMP14]]) +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.minnum.f64(double [[TMP15]], double [[TMP16]]) +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.minnum.f64(double [[TMP17]], double [[TMP18]]) +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.minnum.f64(double [[TMP19]], double [[TMP20]]) +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) +; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]] +; IR-DPP: 25: +; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] syncscope("agent") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP27]] +; IR-DPP: 27: +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: ret void ; %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic ret void @@ -1064,9 +1328,75 @@ define amdgpu_ps void @global_atomic_fmax_double_uni_address_uni_value_agent_sco } define amdgpu_ps void @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double %val) #1{ -; IR-LABEL: @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 -; IR-NEXT: ret void +; IR-ITERATIVE-LABEL: @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] syncscope("agent") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: br label [[TMP13]] +; IR-ITERATIVE: 13: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ 0x7FF8000000000000, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP17]] = call double @llvm.experimental.constrained.maxnum.f64(double [[ACCUMULATOR]], double [[TMP16]], metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1 +; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double 0x7FF8000000000000) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP9]], double [[TMP10]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP11]], double [[TMP12]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP13]], double [[TMP14]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP15]], double [[TMP16]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP17]], double [[TMP18]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP19]], double [[TMP20]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]] +; IR-DPP: 25: +; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] syncscope("agent") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP27]] +; IR-DPP: 27: +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: ret void ; %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic ret void @@ -1126,9 +1456,75 @@ define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_system_sc } define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 { -; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4 -; IR-NEXT: ret void +; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: br label [[TMP13]] +; IR-ITERATIVE: 13: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP17]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1 +; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]] +; IR-DPP: 25: +; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP27]] +; IR-DPP: 27: +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index ab32efc4d3cd8..c05f9c679979d 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -5,7 +5,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s -; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s @@ -191,6 +191,42 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX7LESS-DPP-NEXT: .LBB0_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec @@ -337,19 +373,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -361,27 +395,51 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX7LESS-NEXT: .LBB1_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB1_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB1_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_4 +; GFX7LESS-NEXT: .LBB1_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: @@ -676,6 +734,56 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: .LBB1_4: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -1305,6 +1413,52 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: .LBB2_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7LESS-DPP-NEXT: .LBB2_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 @@ -1517,19 +1671,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -1541,27 +1693,51 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX7LESS-NEXT: .LBB3_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB3_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB3_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_4 +; GFX7LESS-NEXT: .LBB3_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -1882,6 +2058,56 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1132-NEXT: .LBB3_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -2537,6 +2763,52 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX7LESS-DPP-NEXT: .LBB4_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 @@ -2749,19 +3021,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -2773,27 +3043,51 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX7LESS-NEXT: .LBB5_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB5_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB5_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_4 +; GFX7LESS-NEXT: .LBB5_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: @@ -3088,6 +3382,56 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: .LBB5_4: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -3471,19 +3815,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -3495,27 +3837,51 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX7LESS-NEXT: .LBB6_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB6_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB6_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB6_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_4 +; GFX7LESS-NEXT: .LBB6_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: @@ -3810,6 +4176,56 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: .LBB6_4: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -4439,6 +4855,52 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-NEXT: .LBB7_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX7LESS-DPP-NEXT: .LBB7_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 @@ -4650,19 +5112,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -4674,27 +5134,51 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX7LESS-NEXT: .LBB8_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB8_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB8_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB8_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_4 +; GFX7LESS-NEXT: .LBB8_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp: @@ -5015,6 +5499,56 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1132-NEXT: .LBB8_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -5869,6 +6403,86 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX7LESS-DPP-NEXT: v_cvt_f64_u32_e32 v[1:2], s2 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v4, v0, v4 +; GFX7LESS-DPP-NEXT: v_mul_f64 v[41:42], v[1:2], 4.0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v4, v3 +; GFX7LESS-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX7LESS-DPP-NEXT: .LBB9_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -6257,9 +6871,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -6269,25 +6880,49 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX7LESS-NEXT: s_mov_b32 s12, s41 ; GFX7LESS-NEXT: s_mov_b32 s13, s40 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0 +; GFX7LESS-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX7LESS-NEXT: .LBB10_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB10_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB10_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 -; GFX7LESS-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7LESS-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[40:41] +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 @@ -6310,7 +6945,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b32 s12, s41 ; GFX7LESS-NEXT: s_mov_b32 s13, s40 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) @@ -6324,8 +6959,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_4 +; GFX7LESS-NEXT: .LBB10_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6347,31 +6982,51 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_mov_b32 s41, s6 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-NEXT: s_mov_b32 s12, s41 ; GFX9-NEXT: s_mov_b32 s13, s40 ; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mov_b32_e32 v43, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX9-NEXT: .LBB10_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB10_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX9-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -6388,7 +7043,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s12, s41 ; GFX9-NEXT: s_mov_b32 s13, s40 ; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -6406,8 +7061,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB10_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_cbranch_execnz .LBB10_4 +; GFX9-NEXT: .LBB10_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6429,31 +7084,51 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s40, s7 ; GFX1064-NEXT: s_mov_b32 s41, s6 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-NEXT: s_mov_b32 s12, s41 ; GFX1064-NEXT: s_mov_b32 s13, s40 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mov_b32_e32 v43, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v41, v1 -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB10_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] @@ -6462,7 +7137,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s42 @@ -6489,8 +7164,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1064-NEXT: .LBB10_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6512,31 +7187,50 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s40, s7 ; GFX1032-NEXT: s_mov_b32 s41, s6 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-NEXT: s_mov_b32 s12, s41 ; GFX1032-NEXT: s_mov_b32 s13, s40 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mov_b32_e32 v43, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: v_mov_b32_e32 v41, v1 -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB10_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] @@ -6545,7 +7239,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s42 @@ -6572,8 +7266,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1032-NEXT: .LBB10_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6586,9 +7280,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -6596,29 +7289,53 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: s_mov_b32 s40, s7 ; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: v_mov_b32_e32 v43, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: v_mov_b32_e32 v41, v1 -; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB10_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 -; GFX1164-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1164-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 @@ -6644,8 +7361,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1164-NEXT: .LBB10_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; @@ -6660,7 +7377,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b32 s40, s14 ; GFX1132-NEXT: s_mov_b32 s41, s13 @@ -6671,24 +7387,49 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 -; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-NEXT: v_mov_b32_e32 v41, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB10_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 -; GFX1132-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1132-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 @@ -6710,11 +7451,97 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1132-NEXT: .LBB10_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], v[40:41] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -6734,31 +7561,86 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_mov_b32 s41, s6 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s41 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s43, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], s[42:43] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] @@ -6775,12 +7657,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s12, s41 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -6791,10 +7673,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6816,31 +7698,78 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v8, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v9, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v8, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[8:9], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] @@ -6849,7 +7778,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 @@ -6876,8 +7805,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6899,31 +7828,72 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] @@ -6932,7 +7902,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 @@ -6959,8 +7929,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6973,9 +7943,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -6983,29 +7952,87 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_permlane64_b32 v11, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v10, v8 +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 -; GFX1164-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 @@ -7031,8 +8058,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1164-DPP-NEXT: .LBB10_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; @@ -7047,7 +8074,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 @@ -7058,24 +8084,74 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 -; GFX1132-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -7097,8 +8173,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1132-DPP-NEXT: .LBB10_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() @@ -7364,6 +8440,55 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-NEXT: .LBB11_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX7LESS-DPP-NEXT: .LBB11_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 @@ -7579,19 +8704,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -7603,30 +8726,56 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX7LESS-NEXT: .LBB12_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB12_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB12_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_4 +; GFX7LESS-NEXT: .LBB12_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -7637,10 +8786,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -7655,24 +8804,47 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-NEXT: .LBB12_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB12_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB12_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB12_4 +; GFX9-NEXT: .LBB12_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -7682,43 +8854,66 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB12_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1064-NEXT: .LBB12_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -7728,117 +8923,245 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB12_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1032-NEXT: .LBB12_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB12_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1164-NEXT: .LBB12_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB12_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1132-NEXT: .LBB12_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -7847,10 +9170,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -7865,26 +9188,85 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-DPP-NEXT: s_endpgm -; +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX9-DPP-NEXT: .LBB12_3: +; GFX9-DPP-NEXT: s_endpgm +; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1064-DPP: ; %bb.0: ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -7892,43 +9274,93 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1064-DPP-NEXT: .LBB12_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -7938,115 +9370,271 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1032-DPP-NEXT: .LBB12_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1164-DPP-NEXT: .LBB12_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1132-DPP-NEXT: .LBB12_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() strictfp %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic @@ -8311,6 +9899,55 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-DPP-NEXT: .LBB13_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 @@ -8526,19 +10163,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -8550,30 +10185,56 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX7LESS-NEXT: .LBB14_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB14_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB14_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_4 +; GFX7LESS-NEXT: .LBB14_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: @@ -8584,10 +10245,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -8602,24 +10263,47 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-NEXT: .LBB14_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB14_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB14_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB14_4 +; GFX9-NEXT: .LBB14_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: @@ -8629,43 +10313,66 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB14_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1064-NEXT: .LBB14_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: @@ -8675,117 +10382,245 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB14_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: -; GFX1164: ; %bb.0: +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1032-NEXT: .LBB14_5: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB14_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1164-NEXT: .LBB14_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB14_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1132-NEXT: .LBB14_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -8794,10 +10629,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -8812,24 +10647,83 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX9-DPP-NEXT: .LBB14_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: @@ -8839,43 +10733,93 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1064-DPP-NEXT: .LBB14_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: @@ -8885,115 +10829,271 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1032-DPP-NEXT: .LBB14_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1164-DPP-NEXT: .LBB14_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1132-DPP-NEXT: .LBB14_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic @@ -9004,19 +11104,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -9028,30 +11126,56 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX7LESS-NEXT: .LBB15_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB15_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB15_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_4 +; GFX7LESS-NEXT: .LBB15_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: @@ -9062,10 +11186,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -9080,70 +11204,116 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-NEXT: .LBB15_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB15_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB15_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB15_4: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_5: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB15_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1064-NEXT: .LBB15_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: @@ -9153,117 +11323,245 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB15_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1032-NEXT: .LBB15_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB15_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1164-NEXT: .LBB15_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB15_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1132-NEXT: .LBB15_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -9272,10 +11570,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -9290,24 +11588,83 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX9-DPP-NEXT: .LBB15_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: @@ -9317,43 +11674,93 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1064-DPP-NEXT: .LBB15_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: @@ -9363,115 +11770,271 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1032-DPP-NEXT: .LBB15_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1164-DPP-NEXT: .LBB15_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1132-DPP-NEXT: .LBB15_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() strictfp %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic @@ -9944,6 +12507,88 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX7LESS-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0xc3300000 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v5, exec_hi, v5 +; GFX7LESS-DPP-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] +; GFX7LESS-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v3, v2 +; GFX7LESS-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX7LESS-DPP-NEXT: .LBB16_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -10346,9 +12991,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -10358,25 +13000,49 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX7LESS-NEXT: s_mov_b32 s12, s41 ; GFX7LESS-NEXT: s_mov_b32 s13, s40 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0 +; GFX7LESS-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX7LESS-NEXT: .LBB17_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB17_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB17_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 -; GFX7LESS-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7LESS-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[40:41] +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 @@ -10399,7 +13065,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b32 s12, s41 ; GFX7LESS-NEXT: s_mov_b32 s13, s40 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) @@ -10413,8 +13079,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_4 +; GFX7LESS-NEXT: .LBB17_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: @@ -10436,31 +13102,51 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_mov_b32 s41, s6 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-NEXT: s_mov_b32 s12, s41 ; GFX9-NEXT: s_mov_b32 s13, s40 ; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mov_b32_e32 v43, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX9-NEXT: .LBB17_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB17_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX9-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -10477,7 +13163,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b32 s12, s41 ; GFX9-NEXT: s_mov_b32 s13, s40 ; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -10495,8 +13181,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB17_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_cbranch_execnz .LBB17_4 +; GFX9-NEXT: .LBB17_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: @@ -10518,31 +13204,51 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s40, s7 ; GFX1064-NEXT: s_mov_b32 s41, s6 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-NEXT: s_mov_b32 s12, s41 ; GFX1064-NEXT: s_mov_b32 s13, s40 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mov_b32_e32 v43, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v41, v1 -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB17_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] @@ -10551,7 +13257,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s42 @@ -10578,8 +13284,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1064-NEXT: .LBB17_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: @@ -10601,31 +13307,50 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b32 s40, s7 ; GFX1032-NEXT: s_mov_b32 s41, s6 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-NEXT: s_mov_b32 s12, s41 ; GFX1032-NEXT: s_mov_b32 s13, s40 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mov_b32_e32 v43, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: v_mov_b32_e32 v41, v1 -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB17_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] @@ -10634,7 +13359,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s42 @@ -10661,8 +13386,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1032-NEXT: .LBB17_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: @@ -10675,9 +13400,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -10685,29 +13409,53 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: s_mov_b32 s40, s7 ; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: v_mov_b32_e32 v43, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: v_mov_b32_e32 v41, v1 -; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB17_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 -; GFX1164-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1164-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 @@ -10733,8 +13481,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1164-NEXT: .LBB17_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; @@ -10749,7 +13497,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b32 s40, s14 ; GFX1132-NEXT: s_mov_b32 s41, s13 @@ -10760,24 +13507,49 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 -; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-NEXT: v_mov_b32_e32 v41, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB17_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 -; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1132-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 @@ -10799,11 +13571,97 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1132-NEXT: .LBB17_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], v[40:41] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -10823,31 +13681,86 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_mov_b32 s41, s6 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s41 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s43, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], s[42:43] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] @@ -10864,12 +13777,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b32 s12, s41 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -10880,10 +13793,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX9-DPP-NEXT: .LBB17_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: @@ -10905,31 +13818,78 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v8, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v9, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v8, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[8:9], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] @@ -10938,7 +13898,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 @@ -10965,8 +13925,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1064-DPP-NEXT: .LBB17_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: @@ -10988,31 +13948,72 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] @@ -11021,7 +14022,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 @@ -11048,8 +14049,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1032-DPP-NEXT: .LBB17_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: @@ -11062,9 +14063,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -11072,29 +14072,87 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_permlane64_b32 v11, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v10, v8 +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 -; GFX1164-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 @@ -11120,8 +14178,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1164-DPP-NEXT: .LBB17_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; @@ -11136,7 +14194,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 @@ -11147,24 +14204,74 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 -; GFX1132-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -11186,8 +14293,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1132-DPP-NEXT: .LBB17_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() strictfp @@ -11397,6 +14504,42 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-NEXT: .LBB18_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB18_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB18_2 +; GFX7LESS-DPP-NEXT: .LBB18_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec @@ -11767,6 +14910,42 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-NEXT: .LBB19_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB19_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB19_2 +; GFX7LESS-DPP-NEXT: .LBB19_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index a13e704a1a5fc..46f0bb0393885 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -5,7 +5,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s -; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s @@ -145,6 +145,39 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX7LESS-DPP-NEXT: .LBB0_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -248,19 +281,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -272,14 +303,40 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX7LESS-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX7LESS-NEXT: .LBB1_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB1_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB1_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 @@ -287,14 +344,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_4 +; GFX7LESS-NEXT: .LBB1_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe: @@ -577,6 +634,58 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: .LBB1_4: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -1153,6 +1262,39 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: .LBB2_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7LESS-DPP-NEXT: .LBB2_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1310,19 +1452,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -1334,14 +1474,40 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX7LESS-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX7LESS-NEXT: .LBB3_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB3_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB3_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 @@ -1349,14 +1515,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_4 +; GFX7LESS-NEXT: .LBB3_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: @@ -1698,6 +1864,58 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-NEXT: .LBB3_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -2335,6 +2553,39 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX7LESS-DPP-NEXT: .LBB4_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2491,19 +2742,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -2515,14 +2764,40 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX7LESS-NEXT: .LBB5_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB5_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB5_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 @@ -2530,14 +2805,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_4 +; GFX7LESS-NEXT: .LBB5_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe: @@ -2879,6 +3154,58 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-NEXT: .LBB5_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -3763,6 +4090,83 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v3, v2 +; GFX7LESS-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX7LESS-DPP-NEXT: .LBB6_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -4135,9 +4539,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -4158,17 +4559,46 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX7LESS-NEXT: .LBB7_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB7_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB7_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 -; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[41:42] ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -4195,14 +4625,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_4 +; GFX7LESS-NEXT: .LBB7_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: @@ -4224,7 +4654,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_mov_b32 s41, s6 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] @@ -4238,16 +4667,41 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX9-NEXT: .LBB7_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB7_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX9-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -4255,8 +4709,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] @@ -4277,14 +4731,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB7_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_cbranch_execnz .LBB7_4 +; GFX9-NEXT: .LBB7_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: @@ -4306,7 +4760,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s40, s7 ; GFX1064-NEXT: s_mov_b32 s41, s6 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -4320,28 +4773,53 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_max_f64 v[2:3], v[3:4], v[3:4] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_max_f64 v[3:4], v[2:3], v[4:5] +; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB7_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1064-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 @@ -4352,22 +4830,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s13, s40 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1064-NEXT: .LBB7_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: @@ -4389,7 +4867,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s40, s7 ; GFX1032-NEXT: s_mov_b32 s41, s6 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -4403,23 +4880,47 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mov_b32_e32 v41, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB7_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX1032-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v2, s42 @@ -4443,14 +4944,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1032-NEXT: .LBB7_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: @@ -4463,9 +4964,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -4476,18 +4976,47 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: s_mov_b32 s40, s7 ; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: v_mov_b32_e32 v41, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB7_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[42:43] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 -; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] @@ -4495,8 +5024,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] @@ -4506,22 +5035,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1164-NEXT: .LBB7_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; @@ -4536,7 +5065,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b32 s40, s14 ; GFX1132-NEXT: s_mov_b32 s41, s13 @@ -4547,18 +5075,48 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB7_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[42:43] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 -; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -4566,33 +5124,120 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v3, s43 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s41 ; GFX1132-NEXT: s_mov_b32 s13, s40 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s42 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-NEXT: v_mov_b32_e32 v5, 8 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1132-NEXT: .LBB7_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -4612,7 +5257,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_mov_b32 s41, s6 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] @@ -4626,16 +5270,81 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s43, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[42:43], s[42:43] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] @@ -4643,36 +5352,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s41 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX9-DPP-NEXT: .LBB7_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: @@ -4694,7 +5403,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -4708,30 +5416,86 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v9, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v8, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v8, 0 +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], s[4:5], s[4:5] +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], s[2:3], s[2:3] +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] @@ -4740,22 +5504,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1064-DPP-NEXT: .LBB7_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: @@ -4777,7 +5543,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -4791,28 +5556,78 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -4823,22 +5638,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1032-DPP-NEXT: .LBB7_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: @@ -4851,9 +5666,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -4864,18 +5678,89 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlane64_b32 v11, v9 +; GFX1164-DPP-NEXT: v_permlane64_b32 v10, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 -; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] @@ -4883,7 +5768,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -4892,24 +5777,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1164-DPP-NEXT: .LBB7_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; @@ -4924,7 +5809,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 @@ -4935,24 +5819,83 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 -; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -4961,23 +5904,23 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1132-DPP-NEXT: .LBB7_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() @@ -5180,6 +6123,43 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-NEXT: .LBB8_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX7LESS-DPP-NEXT: .LBB8_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5344,19 +6324,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -5368,14 +6346,42 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7LESS-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX7LESS-NEXT: .LBB9_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB9_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB9_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX7LESS-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -5385,15 +6391,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_4 +; GFX7LESS-NEXT: .LBB9_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: @@ -5404,10 +6410,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -5422,26 +6428,51 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX9-NEXT: .LBB9_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB9_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB9_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB9_4 +; GFX9-NEXT: .LBB9_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: @@ -5451,45 +6482,70 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB9_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB9_4 +; GFX1064-NEXT: .LBB9_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: @@ -5499,125 +6555,263 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB9_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB9_4 +; GFX1032-NEXT: .LBB9_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB9_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB9_4 +; GFX1164-NEXT: .LBB9_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB9_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB9_4 +; GFX1132-NEXT: .LBB9_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -5626,10 +6820,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -5644,26 +6838,92 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] +; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[9:10], v[1:2] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX9-DPP-NEXT: .LBB9_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: @@ -5673,45 +6933,103 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[0:1], v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-DPP-NEXT: v_max_f64 v[13:14], v[11:12], v[11:12] +; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[13:14], v[9:10] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1064-DPP-NEXT: .LBB9_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: @@ -5721,123 +7039,303 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] +; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[9:10], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1032-DPP-NEXT: .LBB9_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1164-DPP-NEXT: .LBB9_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1132-DPP-NEXT: .LBB9_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic @@ -6275,6 +7773,83 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v3, v2 +; GFX7LESS-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX7LESS-DPP-NEXT: .LBB10_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -6647,9 +8222,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -6670,17 +8242,46 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX7LESS-NEXT: .LBB11_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB11_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB11_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 -; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX7LESS-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[41:42] ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -6707,14 +8308,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_4 +; GFX7LESS-NEXT: .LBB11_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: @@ -6736,7 +8337,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_mov_b32 s41, s6 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] @@ -6750,16 +8350,41 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX9-NEXT: .LBB11_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB11_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX9-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -6767,8 +8392,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] @@ -6789,14 +8414,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB11_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: @@ -6818,7 +8443,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s40, s7 ; GFX1064-NEXT: s_mov_b32 s41, s6 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -6832,28 +8456,53 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_max_f64 v[2:3], v[3:4], v[3:4] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_max_f64 v[3:4], v[2:3], v[4:5] +; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB11_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1064-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 @@ -6864,22 +8513,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s13, s40 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1064-NEXT: .LBB11_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: @@ -6901,7 +8550,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b32 s40, s7 ; GFX1032-NEXT: s_mov_b32 s41, s6 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -6915,23 +8563,47 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mov_b32_e32 v41, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB11_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX1032-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v2, s42 @@ -6955,14 +8627,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1032-NEXT: .LBB11_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: @@ -6975,9 +8647,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -6988,18 +8659,47 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: s_mov_b32 s40, s7 ; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: v_mov_b32_e32 v41, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB11_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[42:43] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 -; GFX1164-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] @@ -7007,8 +8707,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] @@ -7018,22 +8718,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1164-NEXT: .LBB11_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; @@ -7048,7 +8748,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b32 s40, s14 ; GFX1132-NEXT: s_mov_b32 s41, s13 @@ -7059,18 +8758,48 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB11_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[42:43] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 -; GFX1132-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -7078,33 +8807,120 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v3, s43 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s41 ; GFX1132-NEXT: s_mov_b32 s13, s40 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s42 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-NEXT: v_mov_b32_e32 v5, 8 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1132-NEXT: .LBB11_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -7124,7 +8940,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_mov_b32 s41, s6 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] @@ -7138,16 +8953,81 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s43, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[42:43], s[42:43] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] @@ -7155,36 +9035,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s41 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX9-DPP-NEXT: .LBB11_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: @@ -7206,7 +9086,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -7220,30 +9099,86 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v9, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v8, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v8, 0 +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], s[4:5], s[4:5] +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], s[2:3], s[2:3] +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] @@ -7252,22 +9187,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1064-DPP-NEXT: .LBB11_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: @@ -7289,7 +9226,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -7303,28 +9239,78 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -7335,22 +9321,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1032-DPP-NEXT: .LBB11_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: @@ -7363,9 +9349,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -7376,18 +9361,89 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlane64_b32 v11, v9 +; GFX1164-DPP-NEXT: v_permlane64_b32 v10, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 -; GFX1164-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] @@ -7395,7 +9451,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -7404,24 +9460,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1164-DPP-NEXT: .LBB11_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; @@ -7436,7 +9492,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 @@ -7447,24 +9502,83 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 -; GFX1132-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -7473,23 +9587,23 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1132-DPP-NEXT: .LBB11_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() @@ -7680,6 +9794,39 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-NEXT: .LBB12_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX7LESS-DPP-NEXT: .LBB12_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8015,6 +10162,39 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-DPP-NEXT: .LBB13_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index 65d0b9eafdf82..bd5e589ec2be7 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -5,7 +5,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s -; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s @@ -145,6 +145,39 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX7LESS-DPP-NEXT: .LBB0_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -248,19 +281,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -272,14 +303,40 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX7LESS-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX7LESS-NEXT: .LBB1_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB1_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB1_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 @@ -287,14 +344,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_4 +; GFX7LESS-NEXT: .LBB1_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe: @@ -577,6 +634,58 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: .LBB1_4: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -1153,6 +1262,39 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: .LBB2_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7LESS-DPP-NEXT: .LBB2_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1310,19 +1452,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -1334,14 +1474,40 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX7LESS-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX7LESS-NEXT: .LBB3_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB3_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB3_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 @@ -1349,14 +1515,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_4 +; GFX7LESS-NEXT: .LBB3_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: @@ -1698,6 +1864,58 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-NEXT: .LBB3_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -2335,6 +2553,39 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX7LESS-DPP-NEXT: .LBB4_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2491,19 +2742,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -2515,14 +2764,40 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX7LESS-NEXT: .LBB5_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB5_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB5_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 @@ -2530,14 +2805,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_4 +; GFX7LESS-NEXT: .LBB5_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe: @@ -2879,6 +3154,58 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-NEXT: .LBB5_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -3763,6 +4090,83 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v3, v2 +; GFX7LESS-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX7LESS-DPP-NEXT: .LBB6_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -4135,9 +4539,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -4158,17 +4559,46 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX7LESS-NEXT: .LBB7_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB7_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB7_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 -; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], v[41:42] ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -4195,14 +4625,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_4 +; GFX7LESS-NEXT: .LBB7_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: @@ -4224,7 +4654,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_mov_b32 s41, s6 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] @@ -4238,16 +4667,41 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX9-NEXT: .LBB7_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB7_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX9-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -4255,8 +4709,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] @@ -4277,14 +4731,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB7_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_cbranch_execnz .LBB7_4 +; GFX9-NEXT: .LBB7_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: @@ -4306,7 +4760,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s40, s7 ; GFX1064-NEXT: s_mov_b32 s41, s6 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -4320,28 +4773,53 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_max_f64 v[2:3], v[3:4], v[3:4] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_min_f64 v[3:4], v[2:3], v[4:5] +; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB7_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1064-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 @@ -4352,22 +4830,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s13, s40 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1064-NEXT: .LBB7_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: @@ -4389,7 +4867,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s40, s7 ; GFX1032-NEXT: s_mov_b32 s41, s6 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -4403,23 +4880,47 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mov_b32_e32 v41, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB7_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX1032-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v2, s42 @@ -4443,14 +4944,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1032-NEXT: .LBB7_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: @@ -4463,9 +4964,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -4476,18 +4976,47 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: s_mov_b32 s40, s7 ; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: v_mov_b32_e32 v41, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB7_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[42:43] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 -; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] @@ -4495,8 +5024,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] @@ -4506,22 +5035,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1164-NEXT: .LBB7_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; @@ -4536,7 +5065,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b32 s40, s14 ; GFX1132-NEXT: s_mov_b32 s41, s13 @@ -4547,18 +5075,48 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB7_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[42:43] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 -; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -4566,33 +5124,120 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v3, s43 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s41 ; GFX1132-NEXT: s_mov_b32 s13, s40 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s42 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-NEXT: v_mov_b32_e32 v5, 8 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1132-NEXT: .LBB7_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -4612,7 +5257,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_mov_b32 s41, s6 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] @@ -4626,16 +5270,81 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s43, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[42:43], s[42:43] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] @@ -4643,36 +5352,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s41 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX9-DPP-NEXT: .LBB7_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: @@ -4694,7 +5403,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -4708,30 +5416,86 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v9, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v8, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v8, 0 +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], s[4:5], s[4:5] +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], s[2:3], s[2:3] +; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] @@ -4740,22 +5504,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1064-DPP-NEXT: .LBB7_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: @@ -4777,7 +5543,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -4791,28 +5556,78 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -4823,22 +5638,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1032-DPP-NEXT: .LBB7_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: @@ -4851,9 +5666,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -4864,18 +5678,89 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlane64_b32 v11, v9 +; GFX1164-DPP-NEXT: v_permlane64_b32 v10, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 -; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] @@ -4883,7 +5768,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -4892,24 +5777,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1164-DPP-NEXT: .LBB7_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; @@ -4924,7 +5809,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 @@ -4935,24 +5819,83 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 -; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -4961,23 +5904,23 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1132-DPP-NEXT: .LBB7_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() @@ -5180,6 +6123,43 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-NEXT: .LBB8_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX7LESS-DPP-NEXT: .LBB8_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5344,19 +6324,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -5368,14 +6346,42 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7LESS-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX7LESS-NEXT: .LBB9_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB9_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB9_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX7LESS-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -5385,15 +6391,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_4 +; GFX7LESS-NEXT: .LBB9_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: @@ -5404,10 +6410,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -5422,26 +6428,51 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX9-NEXT: .LBB9_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB9_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB9_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB9_4 +; GFX9-NEXT: .LBB9_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: @@ -5451,45 +6482,70 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB9_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB9_4 +; GFX1064-NEXT: .LBB9_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: @@ -5499,125 +6555,263 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB9_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB9_4 +; GFX1032-NEXT: .LBB9_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB9_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB9_4 +; GFX1164-NEXT: .LBB9_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB9_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB9_4 +; GFX1132-NEXT: .LBB9_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -5626,10 +6820,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -5644,26 +6838,92 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] +; GFX9-DPP-NEXT: v_min_f64 v[9:10], v[9:10], v[1:2] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX9-DPP-NEXT: .LBB9_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: @@ -5673,45 +6933,103 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3] +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[0:1], v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-DPP-NEXT: v_max_f64 v[13:14], v[11:12], v[11:12] +; GFX1064-DPP-NEXT: v_min_f64 v[9:10], v[13:14], v[9:10] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1064-DPP-NEXT: .LBB9_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: @@ -5721,123 +7039,303 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] +; GFX1032-DPP-NEXT: v_min_f64 v[9:10], v[9:10], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1032-DPP-NEXT: .LBB9_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1164-DPP-NEXT: .LBB9_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1132-DPP-NEXT: .LBB9_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic @@ -6275,6 +7773,83 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v3, v2 +; GFX7LESS-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX7LESS-DPP-NEXT: .LBB10_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -6647,9 +8222,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -6670,17 +8242,46 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX7LESS-NEXT: .LBB11_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB11_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB11_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 -; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX7LESS-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], v[41:42] ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -6707,14 +8308,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_4 +; GFX7LESS-NEXT: .LBB11_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: @@ -6736,7 +8337,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_mov_b32 s41, s6 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] @@ -6750,16 +8350,41 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX9-NEXT: .LBB11_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB11_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX9-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -6767,8 +8392,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] @@ -6789,14 +8414,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB11_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: @@ -6818,7 +8443,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s40, s7 ; GFX1064-NEXT: s_mov_b32 s41, s6 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -6832,28 +8456,53 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_max_f64 v[2:3], v[3:4], v[3:4] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_min_f64 v[3:4], v[2:3], v[4:5] +; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB11_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1064-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 @@ -6864,22 +8513,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s13, s40 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1064-NEXT: .LBB11_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: @@ -6901,7 +8550,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b32 s40, s7 ; GFX1032-NEXT: s_mov_b32 s41, s6 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -6915,23 +8563,47 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mov_b32_e32 v41, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB11_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX1032-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v2, s42 @@ -6955,14 +8627,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1032-NEXT: .LBB11_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: @@ -6975,9 +8647,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -6988,18 +8659,47 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: s_mov_b32 s40, s7 ; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: v_mov_b32_e32 v41, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB11_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[42:43] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 -; GFX1164-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] @@ -7007,8 +8707,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] @@ -7018,22 +8718,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1164-NEXT: .LBB11_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; @@ -7048,7 +8748,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b32 s40, s14 ; GFX1132-NEXT: s_mov_b32 s41, s13 @@ -7059,18 +8758,48 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB11_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[42:43] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 -; GFX1132-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -7078,33 +8807,120 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v3, s43 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s41 ; GFX1132-NEXT: s_mov_b32 s13, s40 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s42 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-NEXT: v_mov_b32_e32 v5, 8 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1132-NEXT: .LBB11_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -7124,7 +8940,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_mov_b32 s41, s6 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] @@ -7138,16 +8953,81 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s43, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[42:43], s[42:43] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] @@ -7155,36 +9035,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s41 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX9-DPP-NEXT: .LBB11_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: @@ -7206,7 +9086,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -7220,30 +9099,86 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v9, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v8, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v8, 0 +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], s[4:5], s[4:5] +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], s[2:3], s[2:3] +; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] @@ -7252,22 +9187,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1064-DPP-NEXT: .LBB11_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: @@ -7289,7 +9226,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -7303,28 +9239,78 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -7335,22 +9321,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1032-DPP-NEXT: .LBB11_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: @@ -7363,9 +9349,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -7376,18 +9361,89 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlane64_b32 v11, v9 +; GFX1164-DPP-NEXT: v_permlane64_b32 v10, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 -; GFX1164-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] @@ -7395,7 +9451,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -7404,24 +9460,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1164-DPP-NEXT: .LBB11_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; @@ -7436,7 +9492,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 @@ -7447,24 +9502,83 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 -; GFX1132-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -7473,23 +9587,23 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1132-DPP-NEXT: .LBB11_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() @@ -7680,6 +9794,39 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-NEXT: .LBB12_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX7LESS-DPP-NEXT: .LBB12_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8015,6 +10162,39 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-DPP-NEXT: .LBB13_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index 2bba8d4f43b1a..5ffa71d37164c 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -5,7 +5,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s -; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s @@ -217,6 +217,42 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: .LBB0_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX7LESS-DPP-NEXT: .LBB0_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec @@ -389,19 +425,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -413,27 +447,51 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX7LESS-NEXT: .LBB1_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB1_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB1_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_4 +; GFX7LESS-NEXT: .LBB1_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: @@ -754,6 +812,56 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: .LBB1_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -1409,6 +1517,52 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: .LBB2_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7LESS-DPP-NEXT: .LBB2_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 @@ -1621,19 +1775,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -1645,27 +1797,51 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX7LESS-NEXT: .LBB3_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB3_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB3_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_4 +; GFX7LESS-NEXT: .LBB3_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -1986,6 +2162,56 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-NEXT: .LBB3_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -2641,6 +2867,52 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX7LESS-DPP-NEXT: .LBB4_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 @@ -2853,19 +3125,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -2877,27 +3147,51 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX7LESS-NEXT: .LBB5_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB5_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB5_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_4 +; GFX7LESS-NEXT: .LBB5_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: @@ -3218,6 +3512,56 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: .LBB5_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -3627,19 +3971,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -3651,27 +3993,51 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX7LESS-NEXT: .LBB6_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB6_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB6_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB6_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_4 +; GFX7LESS-NEXT: .LBB6_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: @@ -3992,6 +4358,56 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: .LBB6_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -4647,6 +5063,52 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-NEXT: .LBB7_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX7LESS-DPP-NEXT: .LBB7_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 @@ -4858,19 +5320,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -4882,27 +5342,51 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX7LESS-NEXT: .LBB8_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB8_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB8_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB8_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_4 +; GFX7LESS-NEXT: .LBB8_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_default_scope_strictfp: @@ -5223,6 +5707,56 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1132-NEXT: .LBB8_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_default_scope_strictfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -6077,6 +6611,86 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX7LESS-DPP-NEXT: v_cvt_f64_u32_e32 v[1:2], s2 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v4, v0, v4 +; GFX7LESS-DPP-NEXT: v_mul_f64 v[41:42], v[1:2], 4.0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v4, v3 +; GFX7LESS-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX7LESS-DPP-NEXT: .LBB9_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -6465,9 +7079,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -6477,25 +7088,49 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX7LESS-NEXT: s_mov_b32 s12, s41 ; GFX7LESS-NEXT: s_mov_b32 s13, s40 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0 +; GFX7LESS-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX7LESS-NEXT: .LBB10_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB10_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB10_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 -; GFX7LESS-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7LESS-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[40:41] +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 @@ -6518,7 +7153,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b32 s12, s41 ; GFX7LESS-NEXT: s_mov_b32 s13, s40 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) @@ -6532,8 +7167,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_4 +; GFX7LESS-NEXT: .LBB10_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6555,31 +7190,51 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_mov_b32 s41, s6 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-NEXT: s_mov_b32 s12, s41 ; GFX9-NEXT: s_mov_b32 s13, s40 ; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mov_b32_e32 v43, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX9-NEXT: .LBB10_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB10_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX9-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -6596,7 +7251,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s12, s41 ; GFX9-NEXT: s_mov_b32 s13, s40 ; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -6614,8 +7269,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB10_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_cbranch_execnz .LBB10_4 +; GFX9-NEXT: .LBB10_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6637,31 +7292,51 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s40, s7 ; GFX1064-NEXT: s_mov_b32 s41, s6 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-NEXT: s_mov_b32 s12, s41 ; GFX1064-NEXT: s_mov_b32 s13, s40 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mov_b32_e32 v43, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v41, v1 -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB10_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] @@ -6670,7 +7345,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s42 @@ -6697,8 +7372,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1064-NEXT: .LBB10_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6720,31 +7395,50 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s40, s7 ; GFX1032-NEXT: s_mov_b32 s41, s6 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-NEXT: s_mov_b32 s12, s41 ; GFX1032-NEXT: s_mov_b32 s13, s40 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mov_b32_e32 v43, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: v_mov_b32_e32 v41, v1 -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB10_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] @@ -6753,7 +7447,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s42 @@ -6780,8 +7474,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1032-NEXT: .LBB10_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6794,9 +7488,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -6804,29 +7497,53 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: s_mov_b32 s40, s7 ; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: v_mov_b32_e32 v43, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: v_mov_b32_e32 v41, v1 -; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB10_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 -; GFX1164-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1164-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 @@ -6852,8 +7569,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1164-NEXT: .LBB10_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; @@ -6868,7 +7585,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b32 s40, s14 ; GFX1132-NEXT: s_mov_b32 s41, s13 @@ -6879,24 +7595,49 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 -; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-NEXT: v_mov_b32_e32 v41, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB10_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 -; GFX1132-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1132-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 @@ -6918,11 +7659,97 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1132-NEXT: .LBB10_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], -v[40:41] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -6942,31 +7769,86 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_mov_b32 s41, s6 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s41 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s43, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -s[42:43] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] @@ -6983,12 +7865,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s12, s41 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -6999,10 +7881,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -7024,31 +7906,78 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v8, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v9, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v8, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[8:9], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] @@ -7057,7 +7986,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 @@ -7084,8 +8013,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -7107,40 +8036,81 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] -; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 @@ -7167,8 +8137,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -7181,9 +8151,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -7191,29 +8160,87 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_permlane64_b32 v11, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v10, v8 +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 -; GFX1164-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 @@ -7239,8 +8266,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1164-DPP-NEXT: .LBB10_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; @@ -7255,7 +8282,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 @@ -7266,24 +8292,74 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 -; GFX1132-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -7305,8 +8381,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1132-DPP-NEXT: .LBB10_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() @@ -7572,6 +8648,55 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-NEXT: .LBB11_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX7LESS-DPP-NEXT: .LBB11_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 @@ -7786,19 +8911,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -7810,30 +8933,56 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX7LESS-NEXT: .LBB12_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB12_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB12_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_4 +; GFX7LESS-NEXT: .LBB12_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -7844,10 +8993,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -7862,24 +9011,47 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-NEXT: .LBB12_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB12_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB12_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB12_4 +; GFX9-NEXT: .LBB12_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -7889,43 +9061,66 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB12_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1064-NEXT: .LBB12_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -7935,117 +9130,245 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB12_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1032-NEXT: .LBB12_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB12_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1164-NEXT: .LBB12_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB12_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1132-NEXT: .LBB12_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -8054,10 +9377,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -8072,24 +9395,83 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX9-DPP-NEXT: .LBB12_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -8099,43 +9481,93 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1064-DPP-NEXT: .LBB12_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -8145,115 +9577,271 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1032-DPP-NEXT: .LBB12_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1164-DPP-NEXT: .LBB12_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1132-DPP-NEXT: .LBB12_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() strictfp %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic @@ -8518,6 +10106,55 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-DPP-NEXT: .LBB13_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 @@ -8733,19 +10370,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -8757,30 +10392,56 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX7LESS-NEXT: .LBB14_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB14_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB14_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_4 +; GFX7LESS-NEXT: .LBB14_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: @@ -8791,10 +10452,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -8809,24 +10470,47 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-NEXT: .LBB14_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB14_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB14_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB14_4 +; GFX9-NEXT: .LBB14_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: @@ -8836,43 +10520,66 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB14_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1064-NEXT: .LBB14_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: @@ -8882,117 +10589,245 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB14_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1032-NEXT: .LBB14_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB14_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1164-NEXT: .LBB14_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB14_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1132-NEXT: .LBB14_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -9001,10 +10836,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -9019,24 +10854,83 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX9-DPP-NEXT: .LBB14_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: @@ -9046,43 +10940,93 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1064-DPP-NEXT: .LBB14_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: @@ -9092,115 +11036,271 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1032-DPP-NEXT: .LBB14_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1164-DPP-NEXT: .LBB14_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1132-DPP-NEXT: .LBB14_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic @@ -9211,19 +11311,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -9235,30 +11333,56 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX7LESS-NEXT: .LBB15_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB15_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB15_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_4 +; GFX7LESS-NEXT: .LBB15_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: @@ -9269,10 +11393,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -9287,24 +11411,47 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-NEXT: .LBB15_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB15_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB15_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: @@ -9314,43 +11461,66 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB15_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1064-NEXT: .LBB15_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: @@ -9360,117 +11530,245 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB15_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1032-NEXT: .LBB15_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB15_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1164-NEXT: .LBB15_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB15_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1132-NEXT: .LBB15_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -9479,10 +11777,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -9497,24 +11795,83 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX9-DPP-NEXT: .LBB15_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: @@ -9524,43 +11881,93 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1064-DPP-NEXT: .LBB15_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: @@ -9570,115 +11977,271 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1032-DPP-NEXT: .LBB15_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1164-DPP-NEXT: .LBB15_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1132-DPP-NEXT: .LBB15_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() strictfp %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic @@ -10150,6 +12713,88 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX7LESS-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0xc3300000 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v5, exec_hi, v5 +; GFX7LESS-DPP-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] +; GFX7LESS-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v3, v2 +; GFX7LESS-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX7LESS-DPP-NEXT: .LBB16_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -10552,9 +13197,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -10564,25 +13206,49 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX7LESS-NEXT: s_mov_b32 s12, s41 ; GFX7LESS-NEXT: s_mov_b32 s13, s40 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0 +; GFX7LESS-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX7LESS-NEXT: .LBB17_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB17_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB17_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 -; GFX7LESS-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7LESS-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[40:41] +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 @@ -10605,7 +13271,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b32 s12, s41 ; GFX7LESS-NEXT: s_mov_b32 s13, s40 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) @@ -10619,8 +13285,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_4 +; GFX7LESS-NEXT: .LBB17_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: @@ -10642,31 +13308,51 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_mov_b32 s41, s6 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-NEXT: s_mov_b32 s12, s41 ; GFX9-NEXT: s_mov_b32 s13, s40 ; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mov_b32_e32 v43, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX9-NEXT: .LBB17_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB17_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX9-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -10683,7 +13369,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b32 s12, s41 ; GFX9-NEXT: s_mov_b32 s13, s40 ; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -10701,8 +13387,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB17_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_cbranch_execnz .LBB17_4 +; GFX9-NEXT: .LBB17_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: @@ -10724,31 +13410,51 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s40, s7 ; GFX1064-NEXT: s_mov_b32 s41, s6 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-NEXT: s_mov_b32 s12, s41 ; GFX1064-NEXT: s_mov_b32 s13, s40 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mov_b32_e32 v43, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v41, v1 -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB17_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] @@ -10757,7 +13463,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s42 @@ -10784,8 +13490,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1064-NEXT: .LBB17_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: @@ -10807,31 +13513,50 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b32 s40, s7 ; GFX1032-NEXT: s_mov_b32 s41, s6 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-NEXT: s_mov_b32 s12, s41 ; GFX1032-NEXT: s_mov_b32 s13, s40 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mov_b32_e32 v43, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: v_mov_b32_e32 v41, v1 -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB17_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] @@ -10840,7 +13565,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s42 @@ -10867,8 +13592,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1032-NEXT: .LBB17_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: @@ -10881,9 +13606,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -10891,29 +13615,53 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: s_mov_b32 s40, s7 ; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: v_mov_b32_e32 v43, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: v_mov_b32_e32 v41, v1 -; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB17_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 -; GFX1164-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1164-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 @@ -10939,8 +13687,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1164-NEXT: .LBB17_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; @@ -10955,7 +13703,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b32 s40, s14 ; GFX1132-NEXT: s_mov_b32 s41, s13 @@ -10966,24 +13713,49 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 -; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-NEXT: v_mov_b32_e32 v41, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB17_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 -; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1132-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 @@ -11005,11 +13777,97 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1132-NEXT: .LBB17_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], -v[40:41] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -11029,31 +13887,86 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_mov_b32 s41, s6 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s41 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s43, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -s[42:43] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] @@ -11070,12 +13983,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b32 s12, s41 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -11086,10 +13999,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX9-DPP-NEXT: .LBB17_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: @@ -11111,31 +14024,78 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v8, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v9, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v8, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[8:9], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] @@ -11144,7 +14104,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 @@ -11171,8 +14131,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1064-DPP-NEXT: .LBB17_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: @@ -11194,31 +14154,72 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] @@ -11227,7 +14228,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 @@ -11254,8 +14255,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1032-DPP-NEXT: .LBB17_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: @@ -11268,9 +14269,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -11278,29 +14278,87 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_permlane64_b32 v11, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v10, v8 +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 -; GFX1164-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 @@ -11326,8 +14384,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1164-DPP-NEXT: .LBB17_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; @@ -11342,7 +14400,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 @@ -11353,24 +14410,74 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 -; GFX1132-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -11392,8 +14499,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1132-DPP-NEXT: .LBB17_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() strictfp diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index d8a790c718408..31783968d2b28 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -7613,9 +7613,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_2 -; GFX7-NEXT: ; %bb.3: ; %Flow18 +; GFX7-NEXT: ; %bb.3: ; %Flow22 ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: .LBB28_4: ; %Flow19 +; GFX7-NEXT: .LBB28_4: ; %Flow23 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_mov_b64 s[8:9], exec ; GFX7-NEXT: v_readfirstlane_b32 s10, v1 @@ -7643,32 +7643,64 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_6 -; GFX7-NEXT: .LBB28_7: ; %Flow17 +; GFX7-NEXT: .LBB28_7: ; %Flow21 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v0 -; GFX7-NEXT: ds_read_b32 v0, v1 -; GFX7-NEXT: v_add_f32_e32 v2, s10, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, s10 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: .LBB28_8: ; %atomicrmw.start8 +; GFX7-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 +; GFX7-NEXT: v_add_f32_e32 v0, s10, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s10 +; GFX7-NEXT: s_mov_b64 s[0:1], exec +; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GFX7-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX7-NEXT: ; implicit-def: $vgpr0 +; GFX7-NEXT: .LBB28_8: ; %ComputeLoop +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX7-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7-NEXT: v_readfirstlane_b32 s8, v1 +; GFX7-NEXT: v_readlane_b32 s9, v2, s5 +; GFX7-NEXT: s_mov_b32 m0, s5 +; GFX7-NEXT: v_writelane_b32 v0, s8, m0 +; GFX7-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX7-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7-NEXT: s_cbranch_vccnz .LBB28_8 +; GFX7-NEXT: ; %bb.9: ; %ComputeEnd +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7-NEXT: ; implicit-def: $vgpr2 +; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7-NEXT: s_xor_b64 s[6:7], exec, s[0:1] +; GFX7-NEXT: s_cbranch_execz .LBB28_13 +; GFX7-NEXT: ; %bb.10: +; GFX7-NEXT: v_mov_b32_e32 v3, s4 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b32 v2, v3 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB28_11: ; %atomicrmw.start8 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: v_add_f32_e32 v0, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v0, v1, v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v4, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7-NEXT: s_cbranch_execnz .LBB28_8 -; GFX7-NEXT: ; %bb.9: ; %atomicrmw.end7 -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 +; GFX7-NEXT: s_or_b64 s[4:5], s[0:1], s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB28_11 +; GFX7-NEXT: ; %bb.12: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: .LBB28_13: ; %Flow19 +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX7-NEXT: v_readfirstlane_b32 s4, v2 +; GFX7-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -7705,9 +7737,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_2 -; GFX6-NEXT: ; %bb.3: ; %Flow16 +; GFX6-NEXT: ; %bb.3: ; %Flow20 ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: .LBB28_4: ; %Flow17 +; GFX6-NEXT: .LBB28_4: ; %Flow21 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_mov_b64 s[8:9], exec ; GFX6-NEXT: v_readfirstlane_b32 s10, v1 @@ -7735,32 +7767,64 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_6 -; GFX6-NEXT: .LBB28_7: ; %Flow15 +; GFX6-NEXT: .LBB28_7: ; %Flow19 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v0 -; GFX6-NEXT: ds_read_b32 v0, v1 -; GFX6-NEXT: v_add_f32_e32 v2, s10, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, s10 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX6-NEXT: s_mov_b64 s[0:1], 0 -; GFX6-NEXT: .LBB28_8: ; %atomicrmw.start8 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 +; GFX6-NEXT: v_add_f32_e32 v0, s10, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s10 +; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX6-NEXT: ; implicit-def: $vgpr0 +; GFX6-NEXT: .LBB28_8: ; %ComputeLoop +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: v_readfirstlane_b32 s8, v1 +; GFX6-NEXT: v_readlane_b32 s9, v2, s5 +; GFX6-NEXT: s_mov_b32 m0, s5 +; GFX6-NEXT: v_writelane_b32 v0, s8, m0 +; GFX6-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX6-NEXT: s_cbranch_vccnz .LBB28_8 +; GFX6-NEXT: ; %bb.9: ; %ComputeEnd +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX6-NEXT: ; implicit-def: $vgpr2 +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_xor_b64 s[6:7], exec, s[0:1] +; GFX6-NEXT: s_cbranch_execz .LBB28_13 +; GFX6-NEXT: ; %bb.10: +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: ds_read_b32 v2, v3 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: .LBB28_11: ; %atomicrmw.start8 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: v_add_f32_e32 v0, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v3, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v4, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GFX6-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_cbranch_execnz .LBB28_8 -; GFX6-NEXT: ; %bb.9: ; %atomicrmw.end7 -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 +; GFX6-NEXT: s_or_b64 s[4:5], s[0:1], s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB28_11 +; GFX6-NEXT: ; %bb.12: ; %Flow +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: .LBB28_13: ; %Flow17 +; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: v_readfirstlane_b32 s4, v2 +; GFX6-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -8389,9 +8453,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_2 -; GFX7-NEXT: ; %bb.3: ; %Flow18 +; GFX7-NEXT: ; %bb.3: ; %Flow22 ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: .LBB29_4: ; %Flow19 +; GFX7-NEXT: .LBB29_4: ; %Flow23 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_mov_b64 s[8:9], exec ; GFX7-NEXT: v_readfirstlane_b32 s10, v1 @@ -8419,32 +8483,64 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_6 -; GFX7-NEXT: .LBB29_7: ; %Flow17 +; GFX7-NEXT: .LBB29_7: ; %Flow21 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v0 -; GFX7-NEXT: ds_read_b32 v0, v1 -; GFX7-NEXT: v_add_f32_e32 v2, s10, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, s10 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: .LBB29_8: ; %atomicrmw.start8 +; GFX7-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 +; GFX7-NEXT: v_add_f32_e32 v0, s10, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s10 +; GFX7-NEXT: s_mov_b64 s[0:1], exec +; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GFX7-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX7-NEXT: ; implicit-def: $vgpr0 +; GFX7-NEXT: .LBB29_8: ; %ComputeLoop +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX7-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7-NEXT: v_readfirstlane_b32 s8, v1 +; GFX7-NEXT: v_readlane_b32 s9, v2, s5 +; GFX7-NEXT: s_mov_b32 m0, s5 +; GFX7-NEXT: v_writelane_b32 v0, s8, m0 +; GFX7-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX7-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7-NEXT: s_cbranch_vccnz .LBB29_8 +; GFX7-NEXT: ; %bb.9: ; %ComputeEnd +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7-NEXT: ; implicit-def: $vgpr2 +; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7-NEXT: s_xor_b64 s[6:7], exec, s[0:1] +; GFX7-NEXT: s_cbranch_execz .LBB29_13 +; GFX7-NEXT: ; %bb.10: +; GFX7-NEXT: v_mov_b32_e32 v3, s4 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b32 v2, v3 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB29_11: ; %atomicrmw.start8 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: v_add_f32_e32 v0, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v0, v1, v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v4, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7-NEXT: s_cbranch_execnz .LBB29_8 -; GFX7-NEXT: ; %bb.9: ; %atomicrmw.end7 -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 +; GFX7-NEXT: s_or_b64 s[4:5], s[0:1], s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB29_11 +; GFX7-NEXT: ; %bb.12: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: .LBB29_13: ; %Flow19 +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX7-NEXT: v_readfirstlane_b32 s4, v2 +; GFX7-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -8481,9 +8577,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_2 -; GFX6-NEXT: ; %bb.3: ; %Flow16 +; GFX6-NEXT: ; %bb.3: ; %Flow20 ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: .LBB29_4: ; %Flow17 +; GFX6-NEXT: .LBB29_4: ; %Flow21 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_mov_b64 s[8:9], exec ; GFX6-NEXT: v_readfirstlane_b32 s10, v1 @@ -8511,32 +8607,64 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_6 -; GFX6-NEXT: .LBB29_7: ; %Flow15 +; GFX6-NEXT: .LBB29_7: ; %Flow19 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v0 -; GFX6-NEXT: ds_read_b32 v0, v1 -; GFX6-NEXT: v_add_f32_e32 v2, s10, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, s10 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX6-NEXT: s_mov_b64 s[0:1], 0 -; GFX6-NEXT: .LBB29_8: ; %atomicrmw.start8 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 +; GFX6-NEXT: v_add_f32_e32 v0, s10, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s10 +; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX6-NEXT: ; implicit-def: $vgpr0 +; GFX6-NEXT: .LBB29_8: ; %ComputeLoop +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: v_readfirstlane_b32 s8, v1 +; GFX6-NEXT: v_readlane_b32 s9, v2, s5 +; GFX6-NEXT: s_mov_b32 m0, s5 +; GFX6-NEXT: v_writelane_b32 v0, s8, m0 +; GFX6-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX6-NEXT: s_cbranch_vccnz .LBB29_8 +; GFX6-NEXT: ; %bb.9: ; %ComputeEnd +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX6-NEXT: ; implicit-def: $vgpr2 +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_xor_b64 s[6:7], exec, s[0:1] +; GFX6-NEXT: s_cbranch_execz .LBB29_13 +; GFX6-NEXT: ; %bb.10: +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: ds_read_b32 v2, v3 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: .LBB29_11: ; %atomicrmw.start8 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: v_add_f32_e32 v0, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v3, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v4, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GFX6-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_cbranch_execnz .LBB29_8 -; GFX6-NEXT: ; %bb.9: ; %atomicrmw.end7 -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 +; GFX6-NEXT: s_or_b64 s[4:5], s[0:1], s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB29_11 +; GFX6-NEXT: ; %bb.12: ; %Flow +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: .LBB29_13: ; %Flow17 +; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: v_readfirstlane_b32 s4, v2 +; GFX6-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm