llvm
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
Lines changed: 17 additions & 5 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
Lines changed: 17 additions & 5 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll
Lines changed: 1158 additions & 188 deletions b/‎llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll
Lines changed: 1158 additions & 188 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
Lines changed: 872 additions & 166 deletions b/‎llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
Lines changed: 872 additions & 166 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
Lines changed: 564 additions & 74 deletions b/‎llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
Lines changed: 564 additions & 74 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
Lines changed: 1138 additions & 194 deletions b/‎llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
Lines changed: 1138 additions & 194 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
Lines changed: 486 additions & 18 deletions b/‎llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
Lines changed: 486 additions & 18 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
Lines changed: 414 additions & 18 deletions b/‎llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
Lines changed: 414 additions & 18 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
Lines changed: 2992 additions & 1062 deletions b/‎llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
Lines changed: 2992 additions & 1062 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
Lines changed: 1894 additions & 579 deletions b/‎llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
Lines changed: 1894 additions & 579 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
Lines changed: 1894 additions & 579 deletions b/‎llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
Lines changed: 1894 additions & 579 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
Lines changed: 2993 additions & 1063 deletions b/‎llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
Lines changed: 2993 additions & 1063 deletions
@@ -178,6 +178,20 @@ bool AMDGPUAtomicOptimizerImpl::run(Function &F) {
   return Changed;
 }
 
+/// Returns true if \p Ty is float, double, or a 32- or 64-bit integer.
+/// Every other type yields false.
+static bool shouldOptimizeForType(Type *Ty) {
+  const Type::TypeID Kind = Ty->getTypeID();
+  if (Kind == Type::FloatTyID || Kind == Type::DoubleTyID)
+    return true;
+  if (Kind != Type::IntegerTyID)
+    return false;
+
+  // Integers qualify only when they are 32 or 64 bits wide.
+  const unsigned Width = Ty->getIntegerBitWidth();
+  return Width == 32 || Width == 64;
+}
+
 void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
   // Early exit for unhandled address space atomic instructions.
   switch (I.getPointerAddressSpace()) {
@@ -230,8 +244,7 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
   // value to the atomic calculation. We can only optimize divergent values if
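-  // we have DPP available on our subtarget, and the atomic operation is 32
-  // bits.
+  // we have DPP available on our subtarget, and the atomic operation's type
+  // is float, double, or a 32- or 64-bit integer.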
-  if (ValDivergent &&
-      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
+  if (ValDivergent && (!ST->hasDPP() || !shouldOptimizeForType(I.getType()))) {
     return;
   }
 
@@ -313,8 +326,7 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
   // value to the atomic calculation. We can only optimize divergent values if
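-  // we have DPP available on our subtarget, and the atomic operation is 32
-  // bits.
+  // we have DPP available on our subtarget, and the atomic operation's type
+  // is float, double, or a 32- or 64-bit integer.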
-  if (ValDivergent &&
-      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
+  if (ValDivergent && (!ST->hasDPP() || !shouldOptimizeForType(I.getType()))) {
     return;
   }
 
@@ -745,7 +757,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
         // of each active lane in the wavefront. This will be our new value
         // which we will provide to the atomic operation.
         Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
-        assert(TyBitWidth == 32);
+        assert(TyBitWidth == 32 || TyBitWidth == 64);
         NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
                                  {NewV, LastLaneIdx});
       }