Commit 3bd161e
[LV] Honor forced scalars in setVectorizedCallDecision.
Similarly to dd94537, setVectorizedCallDecision also did not consider ForcedScalars. This led to VPlans not reflecting the decision made by the legacy cost model: the cost computation would use the scalar cost, while the VPlan would contain a VPWidenCallRecipe.

To fix this, check whether the call has been forced to scalar in setVectorizedCallDecision. Note that this requires moving setVectorizedCallDecision after collectLoopUniforms (which sets ForcedScalars). collectLoopUniforms does not depend on call decisions and can safely be moved.

Fixes #107051.
1 parent fcb7b39
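The underlying issue is an ordering one: ForcedScalars is only populated by collectLoopUniforms, so a decision taken before it runs cannot honor it. The sketch below is a minimal, self-contained C++ analogue of the check the patch adds; it uses standard containers rather than LLVM's DenseMap/SmallPtrSet, and decideCall, InstrSet, and the string keys are illustrative stand-ins, not LLVM API.

#include <cstdio>
#include <map>
#include <set>
#include <string>

// Illustrative analogue, not LLVM code: a per-VF map of instructions that an
// earlier analysis (collectLoopUniforms in LLVM) has forced to be scalar.
using InstrSet = std::set<std::string>;
std::map<unsigned, InstrSet> ForcedScalars;

// Per-call decision: consult ForcedScalars *before* choosing to widen,
// mirroring the check added to setVectorizedCallDecision.
std::string decideCall(unsigned VF, const std::string &Call) {
  auto ForcedScalar = ForcedScalars.find(VF);
  if (VF > 1 && ForcedScalar != ForcedScalars.end() &&
      ForcedScalar->second.count(Call))
    return "scalarize"; // honor the forced-scalar decision
  return "widen";       // otherwise fall through to the widening logic
}

int main() {
  // The uniforms analysis must run first so the map is populated...
  ForcedScalars[4].insert("call-umin");
  // ...only then can the call decision see it.
  std::printf("%s\n", decideCall(4, "call-umin").c_str()); // scalarize
  std::printf("%s\n", decideCall(4, "call-sqrt").c_str()); // widen
}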

File tree

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll

2 files changed: +78 -18 lines

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 27 additions & 17 deletions
@@ -1290,8 +1290,8 @@ class LoopVectorizationCostModel {
     if (VF.isScalar() || Uniforms.contains(VF))
       return;
     setCostBasedWideningDecision(VF);
-    setVectorizedCallDecision(VF);
     collectLoopUniforms(VF);
+    setVectorizedCallDecision(VF);
     collectLoopScalars(VF);
   }

@@ -6194,6 +6194,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
   assert(!VF.isScalar() &&
          "Trying to set a vectorization decision for a scalar VF");

+  auto ForcedScalar = ForcedScalars.find(VF);
   for (BasicBlock *BB : TheLoop->blocks()) {
     // For each instruction in the old loop.
     for (Instruction &I : *BB) {
@@ -6206,14 +6207,37 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
       InstructionCost VectorCost = InstructionCost::getInvalid();
       InstructionCost IntrinsicCost = InstructionCost::getInvalid();
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
       Function *ScalarFunc = CI->getCalledFunction();
       Type *ScalarRetTy = CI->getType();
       SmallVector<Type *, 4> Tys, ScalarTys;
-      bool MaskRequired = Legal->isMaskRequired(CI);
       for (auto &ArgOp : CI->args())
         ScalarTys.push_back(ArgOp->getType());

+      // Estimate cost of scalarized vector call. The source operands are
+      // assumed to be vectors, so we need to extract individual elements from
+      // there, execute VF scalar calls, and then gather the result into the
+      // vector return value.
+      InstructionCost ScalarCallCost =
+          TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
+
+      // Compute costs of unpacking argument values for the scalar calls and
+      // packing the return values to a vector.
+      InstructionCost ScalarizationCost =
+          getScalarizationOverhead(CI, VF, CostKind);
+
+      ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
+      // Honor ForcedScalars decision.
+      // TODO: For calls, it might still be more profitable to widen. Use
+      // VPlan-based cost model to compare different options.
+      if (VF.isVector() && ForcedScalar != ForcedScalars.end() &&
+          ForcedScalar->second.contains(CI)) {
+        setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
+                                Intrinsic::not_intrinsic, std::nullopt,
+                                ScalarCost);
+        continue;
+      }
+
+      bool MaskRequired = Legal->isMaskRequired(CI);
       // Compute corresponding vector type for return value and arguments.
       Type *RetTy = ToVectorTy(ScalarRetTy, VF);
       for (Type *ScalarTy : ScalarTys)
@@ -6229,20 +6253,6 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
         continue;
      }

-      // Estimate cost of scalarized vector call. The source operands are
-      // assumed to be vectors, so we need to extract individual elements from
-      // there, execute VF scalar calls, and then gather the result into the
-      // vector return value.
-      InstructionCost ScalarCallCost =
-          TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
-
-      // Compute costs of unpacking argument values for the scalar calls and
-      // packing the return values to a vector.
-      InstructionCost ScalarizationCost =
-          getScalarizationOverhead(CI, VF, CostKind);
-
-      ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
-
       // Find the cost of vectorizing the call, if we can find a suitable
       // vector variant of the function.
       bool UsesMask = false;
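To make the hoisted cost computation concrete: the estimate is ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost, i.e. VF scalar calls plus the overhead of unpacking vector operands and packing results. A small standalone check with hypothetical numbers (the costs below are made up for illustration; real values come from TTI):

#include <cstdio>

int main() {
  // Hypothetical costs for illustration only, not from any target model.
  unsigned ScalarCallCost = 18;    // cost of one scalar call
  unsigned VFKnownMin = 4;         // VF.getKnownMinValue()
  unsigned ScalarizationCost = 10; // extract args + gather results
  // Mirrors: ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost
  std::printf("ScalarCost = %u\n",
              ScalarCallCost * VFKnownMin + ScalarizationCost); // prints 82
}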

llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll

Lines changed: 51 additions & 1 deletion
@@ -178,7 +178,57 @@ exit:
   ret void
 }

-declare double @llvm.sqrt.f64(double) #0
+define void @call_forced_scalar(ptr %src.1, ptr %src.2, ptr noalias %dst.1, ptr noalias %dst.2) {
+; CHECK-LABEL: define void @call_forced_scalar(
+; CHECK-SAME: ptr [[SRC_1:%.*]], ptr [[SRC_2:%.*]], ptr noalias [[DST_1:%.*]], ptr noalias [[DST_2:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC_1]], align 4
+; CHECK-NEXT:    [[SMAX:%.*]] = tail call i32 @llvm.smax.i32(i32 [[TMP0]], i32 0)
+; CHECK-NEXT:    [[UMIN:%.*]] = tail call i32 @llvm.umin.i32(i32 [[SMAX]], i32 1)
+; CHECK-NEXT:    [[UMIN_EXT:%.*]] = zext i32 [[UMIN]] to i64
+; CHECK-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[UMIN_EXT]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[GEP_SRC_2]], align 1
+; CHECK-NEXT:    [[L_EXT:%.*]] = zext i8 [[TMP1]] to i32
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 3, [[L_EXT]]
+; CHECK-NEXT:    store i32 [[MUL]], ptr [[DST_1]], align 4
+; CHECK-NEXT:    [[GEP_DST_2:%.*]] = getelementptr i32, ptr [[DST_2]], i64 [[IV]]
+; CHECK-NEXT:    store i32 0, ptr [[GEP_DST_2]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %0 = load i32, ptr %src.1, align 4
+  %smax = tail call i32 @llvm.smax.i32(i32 %0, i32 0)
+  %umin = tail call i32 @llvm.umin.i32(i32 %smax, i32 1)
+  %umin.ext = zext i32 %umin to i64
+  %gep.src.2 = getelementptr i8, ptr %src.2, i64 %umin.ext
+  %1 = load i8, ptr %gep.src.2, align 1
+  %l.ext = zext i8 %1 to i32
+  %mul = mul i32 3, %l.ext
+  store i32 %mul, ptr %dst.1, align 4
+  %gep.dst.2 = getelementptr i32, ptr %dst.2, i64 %iv
+  store i32 0, ptr %gep.dst.2, align 4
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 0
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+declare i32 @llvm.smax.i32(i32, i32)
+declare i32 @llvm.umin.i32(i32, i32)
+declare double @llvm.sqrt.f64(double)
 declare double @llvm.powi.f64.i32(double, i32)
 declare i64 @llvm.fshl.i64(i64, i64, i64)
 ;.
