Commit 3bd161e
[LV] Honor forced scalars in setVectorizedCallDecision.
Similarly to dd94537, setVectorizedCallDecision also did not consider ForcedScalars. This led to VPlans not reflecting the decision made by the legacy cost model: the cost computation would use the scalar cost, while the VPlan would contain a VPWidenCallRecipe.

To fix this, check whether the call has been forced to scalar in setVectorizedCallDecision. Note that this requires moving setVectorizedCallDecision after collectLoopUniforms (which sets ForcedScalars). collectLoopUniforms does not depend on call decisions and can safely be moved.

Fixes #107051.
1 parent fcb7b39
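The underlying issue is an ordering one: ForcedScalars is only populated by collectLoopUniforms, so a decision taken before it runs cannot honor it. The sketch below is a minimal, self-contained C++ analogue of the check the patch adds; it uses standard containers rather than LLVM's DenseMap/SmallPtrSet, and decideCall, InstrSet, and the string keys are illustrative stand-ins, not LLVM API.

#include <cstdio>
#include <map>
#include <set>
#include <string>

// Illustrative analogue, not LLVM code: a per-VF map of instructions that an
// earlier analysis (collectLoopUniforms in LLVM) has forced to be scalar.
using InstrSet = std::set<std::string>;
std::map<unsigned, InstrSet> ForcedScalars;

// Per-call decision: consult ForcedScalars *before* choosing to widen,
// mirroring the check added to setVectorizedCallDecision.
std::string decideCall(unsigned VF, const std::string &Call) {
  auto ForcedScalar = ForcedScalars.find(VF);
  if (VF > 1 && ForcedScalar != ForcedScalars.end() &&
      ForcedScalar->second.count(Call))
    return "scalarize"; // honor the forced-scalar decision
  return "widen";       // otherwise fall through to the widening logic
}

int main() {
  // The uniforms analysis must run first so the map is populated...
  ForcedScalars[4].insert("call-umin");
  // ...only then can the call decision see it.
  std::printf("%s\n", decideCall(4, "call-umin").c_str()); // scalarize
  std::printf("%s\n", decideCall(4, "call-sqrt").c_str()); // widen
}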

File tree

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll

2 files changed: +78 -18 lines

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 27 additions & 17 deletions
@@ -1290,8 +1290,8 @@ class LoopVectorizationCostModel {
     if (VF.isScalar() || Uniforms.contains(VF))
       return;
     setCostBasedWideningDecision(VF);
-    setVectorizedCallDecision(VF);
     collectLoopUniforms(VF);
+    setVectorizedCallDecision(VF);
     collectLoopScalars(VF);
   }

@@ -6194,6 +6194,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
   assert(!VF.isScalar() &&
          "Trying to set a vectorization decision for a scalar VF");

+  auto ForcedScalar = ForcedScalars.find(VF);
   for (BasicBlock *BB : TheLoop->blocks()) {
     // For each instruction in the old loop.
     for (Instruction &I : *BB) {
@@ -6206,14 +6207,37 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
       InstructionCost VectorCost = InstructionCost::getInvalid();
       InstructionCost IntrinsicCost = InstructionCost::getInvalid();
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
       Function *ScalarFunc = CI->getCalledFunction();
       Type *ScalarRetTy = CI->getType();
       SmallVector<Type *, 4> Tys, ScalarTys;
-      bool MaskRequired = Legal->isMaskRequired(CI);
       for (auto &ArgOp : CI->args())
         ScalarTys.push_back(ArgOp->getType());

+      // Estimate cost of scalarized vector call. The source operands are
+      // assumed to be vectors, so we need to extract individual elements from
+      // there, execute VF scalar calls, and then gather the result into the
+      // vector return value.
+      InstructionCost ScalarCallCost =
+          TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
+
+      // Compute costs of unpacking argument values for the scalar calls and
+      // packing the return values to a vector.
+      InstructionCost ScalarizationCost =
+          getScalarizationOverhead(CI, VF, CostKind);
+
+      ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
+      // Honor ForcedScalars decision.
+      // TODO: For calls, it might still be more profitable to widen. Use
+      // VPlan-based cost model to compare different options.
+      if (VF.isVector() && ForcedScalar != ForcedScalars.end() &&
+          ForcedScalar->second.contains(CI)) {
+        setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
+                                Intrinsic::not_intrinsic, std::nullopt,
+                                ScalarCost);
+        continue;
+      }
+
+      bool MaskRequired = Legal->isMaskRequired(CI);
       // Compute corresponding vector type for return value and arguments.
       Type *RetTy = ToVectorTy(ScalarRetTy, VF);
       for (Type *ScalarTy : ScalarTys)
@@ -6229,20 +6253,6 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
         continue;
      }

-      // Estimate cost of scalarized vector call. The source operands are
-      // assumed to be vectors, so we need to extract individual elements from
-      // there, execute VF scalar calls, and then gather the result into the
-      // vector return value.
-      InstructionCost ScalarCallCost =
-          TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
-
-      // Compute costs of unpacking argument values for the scalar calls and
-      // packing the return values to a vector.
-      InstructionCost ScalarizationCost =
-          getScalarizationOverhead(CI, VF, CostKind);
-
-      ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
-
       // Find the cost of vectorizing the call, if we can find a suitable
       // vector variant of the function.
       bool UsesMask = false;
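To make the hoisted cost computation concrete: the estimate is ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost, i.e. VF scalar calls plus the overhead of unpacking vector operands and packing results. A small standalone check with hypothetical numbers (the costs below are made up for illustration; real values come from TTI):

#include <cstdio>

int main() {
  // Hypothetical costs for illustration only, not from any target model.
  unsigned ScalarCallCost = 18;    // cost of one scalar call
  unsigned VFKnownMin = 4;         // VF.getKnownMinValue()
  unsigned ScalarizationCost = 10; // extract args + gather results
  // Mirrors: ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost
  std::printf("ScalarCost = %u\n",
              ScalarCallCost * VFKnownMin + ScalarizationCost); // prints 82
}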

llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll

Lines changed: 51 additions & 1 deletion
@@ -178,7 +178,57 @@ exit:
   ret void
 }

-declare double @llvm.sqrt.f64(double) #0
+define void @call_forced_scalar(ptr %src.1, ptr %src.2, ptr noalias %dst.1, ptr noalias %dst.2) {
+; CHECK-LABEL: define void @call_forced_scalar(
+; CHECK-SAME: ptr [[SRC_1:%.*]], ptr [[SRC_2:%.*]], ptr noalias [[DST_1:%.*]], ptr noalias [[DST_2:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC_1]], align 4
+; CHECK-NEXT:    [[SMAX:%.*]] = tail call i32 @llvm.smax.i32(i32 [[TMP0]], i32 0)
+; CHECK-NEXT:    [[UMIN:%.*]] = tail call i32 @llvm.umin.i32(i32 [[SMAX]], i32 1)
+; CHECK-NEXT:    [[UMIN_EXT:%.*]] = zext i32 [[UMIN]] to i64
+; CHECK-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[UMIN_EXT]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[GEP_SRC_2]], align 1
+; CHECK-NEXT:    [[L_EXT:%.*]] = zext i8 [[TMP1]] to i32
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 3, [[L_EXT]]
+; CHECK-NEXT:    store i32 [[MUL]], ptr [[DST_1]], align 4
+; CHECK-NEXT:    [[GEP_DST_2:%.*]] = getelementptr i32, ptr [[DST_2]], i64 [[IV]]
+; CHECK-NEXT:    store i32 0, ptr [[GEP_DST_2]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %0 = load i32, ptr %src.1, align 4
+  %smax = tail call i32 @llvm.smax.i32(i32 %0, i32 0)
+  %umin = tail call i32 @llvm.umin.i32(i32 %smax, i32 1)
+  %umin.ext = zext i32 %umin to i64
+  %gep.src.2 = getelementptr i8, ptr %src.2, i64 %umin.ext
+  %1 = load i8, ptr %gep.src.2, align 1
+  %l.ext = zext i8 %1 to i32
+  %mul = mul i32 3, %l.ext
+  store i32 %mul, ptr %dst.1, align 4
+  %gep.dst.2 = getelementptr i32, ptr %dst.2, i64 %iv
+  store i32 0, ptr %gep.dst.2, align 4
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 0
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+declare i32 @llvm.smax.i32(i32, i32)
+declare i32 @llvm.umin.i32(i32, i32)
+declare double @llvm.sqrt.f64(double)
 declare double @llvm.powi.f64.i32(double, i32)
 declare i64 @llvm.fshl.i64(i64, i64, i64)
 ;.
