Skip to content

Commit 97fcc44

Browse files
committed
[CodeGen] Avoid sinking vector comparisons during CodeGenPrepare
Whilst reviewing PR llvm#109289 and doing some analysis with various tests involving predicated blocks I noticed that we're making codegen and performance worse by sinking vector comparisons multiple times into blocks. It looks like the sinkCmpExpression in CodeGenPrepare was written for scalar comparisons where there is only a single condition register, whereas vector comparisons typically produce a vector result. For some targets, such a NEON or SVE, there are multiple allocatable vector registers that can store the result and so we should avoid sinking in that case.
1 parent 6b93bd0 commit 97fcc44

File tree

5 files changed

+69
-43
lines changed

5 files changed

+69
-43
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -502,6 +502,11 @@ class TargetLoweringBase {
502502
return HasMultipleConditionRegisters;
503503
}
504504

505+
/// Return true if multiple vector predicate registers are available.
506+
bool hasMultipleVectorPredicateRegisters() const {
507+
return HasMultipleVectorPredicateRegisters;
508+
}
509+
505510
/// Return true if the target has BitExtract instructions.
506511
bool hasExtractBitsInsn() const { return HasExtractBitsInsn; }
507512

@@ -2505,6 +2510,15 @@ class TargetLoweringBase {
25052510
HasMultipleConditionRegisters = hasManyRegs;
25062511
}
25072512

2513+
/// Tells the code generator that the target has multiple (allocatable)
2514+
/// vector predicate registers that can be used to store the results of
2515+
/// vector comparisons. With multiple predicate registers, the code
2516+
/// generator will not aggressively sink vector comparisons into the blocks
2517+
/// of their users.
2518+
void setHasMultipleVectorPredicateRegisters(bool hasManyRegs = true) {
2519+
HasMultipleVectorPredicateRegisters = hasManyRegs;
2520+
}
2521+
25082522
/// Tells the code generator that the target has BitExtract instructions.
25092523
/// The code generator will aggressively sink "shift"s into the blocks of
25102524
/// their users if the users will generate "and" instructions which can be
@@ -3477,6 +3491,11 @@ class TargetLoweringBase {
34773491
/// the blocks of their users.
34783492
bool HasMultipleConditionRegisters;
34793493

3494+
/// Tells the code generator that the target has multiple (allocatable)
3495+
/// vector predicate registers that can be used to store the results of
3496+
/// vector comparisons.
3497+
bool HasMultipleVectorPredicateRegisters;
3498+
34803499
/// Tells the code generator that the target has BitExtract instructions.
34813500
/// The code generator will aggressively sink "shift"s into the blocks of
34823501
/// their users if the users will generate "and" instructions which can be

llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1775,6 +1775,13 @@ static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
17751775
if (TLI.hasMultipleConditionRegisters())
17761776
return false;
17771777

1778+
// If this is a vector comparison the result may not depend upon setting a
1779+
// condition register, and if so it's probably better not to sink.
1780+
VectorType *VecType = dyn_cast<VectorType>(Cmp->getType());
1781+
if (VecType && VecType->getElementCount().isVector() &&
1782+
TLI.hasMultipleVectorPredicateRegisters())
1783+
return false;
1784+
17781785
// Avoid sinking soft-FP comparisons, since this can move them into a loop.
17791786
if (TLI.useSoftFloat() && isa<FCmpInst>(Cmp))
17801787
return false;

llvm/lib/CodeGen/TargetLoweringBase.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -626,6 +626,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
626626
MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize =
627627
MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4;
628628
HasMultipleConditionRegisters = false;
629+
HasMultipleVectorPredicateRegisters = false;
629630
HasExtractBitsInsn = false;
630631
JumpIsExpensive = JumpIsExpensiveOverride;
631632
PredictableSelectIsExpensive = false;

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -473,6 +473,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
473473
// Compute derived properties from the register classes
474474
computeRegisterProperties(Subtarget->getRegisterInfo());
475475

476+
if (Subtarget->hasNEON() || Subtarget->hasSVE())
477+
setHasMultipleVectorPredicateRegisters(true);
478+
476479
// Provide all sorts of operation actions
477480
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
478481
setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);

llvm/test/CodeGen/AArch64/no-sink-vector-cmp.ll

Lines changed: 39 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -6,68 +6,64 @@ target triple = "aarch64-unknown-linux-gnu"
66
define void @vector_loop_with_icmp(ptr nocapture noundef writeonly %dest) {
77
; CHECK-LABEL: vector_loop_with_icmp:
88
; CHECK: // %bb.0: // %entry
9-
; CHECK-NEXT: mov w8, #15 // =0xf
9+
; CHECK-NEXT: mov w9, #15 // =0xf
1010
; CHECK-NEXT: mov w10, #4 // =0x4
11-
; CHECK-NEXT: adrp x9, .LCPI0_0
11+
; CHECK-NEXT: adrp x8, .LCPI0_0
1212
; CHECK-NEXT: adrp x11, .LCPI0_1
13-
; CHECK-NEXT: dup v0.2d, x8
13+
; CHECK-NEXT: dup v0.2d, x9
1414
; CHECK-NEXT: dup v1.2d, x10
15-
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_0]
15+
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_0]
1616
; CHECK-NEXT: ldr q3, [x11, :lo12:.LCPI0_1]
17-
; CHECK-NEXT: add x9, x0, #8
18-
; CHECK-NEXT: mov w10, #16 // =0x10
19-
; CHECK-NEXT: mov w11, #1 // =0x1
17+
; CHECK-NEXT: add x8, x0, #8
18+
; CHECK-NEXT: mov w9, #16 // =0x10
19+
; CHECK-NEXT: mov w10, #1 // =0x1
2020
; CHECK-NEXT: b .LBB0_2
2121
; CHECK-NEXT: .LBB0_1: // %pred.store.continue18
2222
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
2323
; CHECK-NEXT: add v2.2d, v2.2d, v1.2d
2424
; CHECK-NEXT: add v3.2d, v3.2d, v1.2d
25-
; CHECK-NEXT: subs x10, x10, #4
26-
; CHECK-NEXT: add x9, x9, #16
25+
; CHECK-NEXT: subs x9, x9, #4
26+
; CHECK-NEXT: add x8, x8, #16
2727
; CHECK-NEXT: b.eq .LBB0_10
2828
; CHECK-NEXT: .LBB0_2: // %vector.body
2929
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
30-
; CHECK-NEXT: cmhi v4.2d, v0.2d, v3.2d
31-
; CHECK-NEXT: xtn v4.2s, v4.2d
32-
; CHECK-NEXT: uzp1 v4.4h, v4.4h, v0.4h
33-
; CHECK-NEXT: umov w12, v4.h[0]
34-
; CHECK-NEXT: tbz w12, #0, .LBB0_4
35-
; CHECK-NEXT: // %bb.3: // %pred.store.if
30+
; CHECK-NEXT: cmhi v4.2d, v0.2d, v2.2d
31+
; CHECK-NEXT: cmhi v5.2d, v0.2d, v3.2d
32+
; CHECK-NEXT: uzp1 v4.4s, v5.4s, v4.4s
33+
; CHECK-NEXT: xtn v4.4h, v4.4s
34+
; CHECK-NEXT: umov w11, v4.h[0]
35+
; CHECK-NEXT: tbnz w11, #0, .LBB0_6
36+
; CHECK-NEXT: // %bb.3: // %pred.store.continue
3637
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
37-
; CHECK-NEXT: stur w11, [x9, #-8]
38-
; CHECK-NEXT: .LBB0_4: // %pred.store.continue
38+
; CHECK-NEXT: umov w11, v4.h[1]
39+
; CHECK-NEXT: tbnz w11, #0, .LBB0_7
40+
; CHECK-NEXT: .LBB0_4: // %pred.store.continue6
3941
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
40-
; CHECK-NEXT: dup v4.2d, x8
41-
; CHECK-NEXT: cmhi v4.2d, v4.2d, v3.2d
42-
; CHECK-NEXT: xtn v4.2s, v4.2d
43-
; CHECK-NEXT: uzp1 v4.4h, v4.4h, v0.4h
44-
; CHECK-NEXT: umov w12, v4.h[1]
45-
; CHECK-NEXT: tbz w12, #0, .LBB0_6
46-
; CHECK-NEXT: // %bb.5: // %pred.store.if5
42+
; CHECK-NEXT: umov w11, v4.h[2]
43+
; CHECK-NEXT: tbnz w11, #0, .LBB0_8
44+
; CHECK-NEXT: .LBB0_5: // %pred.store.continue8
4745
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
48-
; CHECK-NEXT: stur w11, [x9, #-4]
49-
; CHECK-NEXT: .LBB0_6: // %pred.store.continue6
46+
; CHECK-NEXT: umov w11, v4.h[3]
47+
; CHECK-NEXT: tbz w11, #0, .LBB0_1
48+
; CHECK-NEXT: b .LBB0_9
49+
; CHECK-NEXT: .LBB0_6: // %pred.store.if
5050
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
51-
; CHECK-NEXT: dup v4.2d, x8
52-
; CHECK-NEXT: cmhi v4.2d, v4.2d, v2.2d
53-
; CHECK-NEXT: xtn v4.2s, v4.2d
54-
; CHECK-NEXT: uzp1 v4.4h, v0.4h, v4.4h
55-
; CHECK-NEXT: umov w12, v4.h[2]
56-
; CHECK-NEXT: tbz w12, #0, .LBB0_8
57-
; CHECK-NEXT: // %bb.7: // %pred.store.if7
51+
; CHECK-NEXT: stur w10, [x8, #-8]
52+
; CHECK-NEXT: umov w11, v4.h[1]
53+
; CHECK-NEXT: tbz w11, #0, .LBB0_4
54+
; CHECK-NEXT: .LBB0_7: // %pred.store.if5
5855
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
59-
; CHECK-NEXT: str w11, [x9]
60-
; CHECK-NEXT: .LBB0_8: // %pred.store.continue8
56+
; CHECK-NEXT: stur w10, [x8, #-4]
57+
; CHECK-NEXT: umov w11, v4.h[2]
58+
; CHECK-NEXT: tbz w11, #0, .LBB0_5
59+
; CHECK-NEXT: .LBB0_8: // %pred.store.if7
6160
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
62-
; CHECK-NEXT: dup v4.2d, x8
63-
; CHECK-NEXT: cmhi v4.2d, v4.2d, v2.2d
64-
; CHECK-NEXT: xtn v4.2s, v4.2d
65-
; CHECK-NEXT: uzp1 v4.4h, v0.4h, v4.4h
66-
; CHECK-NEXT: umov w12, v4.h[3]
67-
; CHECK-NEXT: tbz w12, #0, .LBB0_1
68-
; CHECK-NEXT: // %bb.9: // %pred.store.if9
61+
; CHECK-NEXT: str w10, [x8]
62+
; CHECK-NEXT: umov w11, v4.h[3]
63+
; CHECK-NEXT: tbz w11, #0, .LBB0_1
64+
; CHECK-NEXT: .LBB0_9: // %pred.store.if9
6965
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
70-
; CHECK-NEXT: str w11, [x9, #4]
66+
; CHECK-NEXT: str w10, [x8, #4]
7167
; CHECK-NEXT: b .LBB0_1
7268
; CHECK-NEXT: .LBB0_10: // %for.cond.cleanup
7369
; CHECK-NEXT: ret

0 commit comments

Comments
 (0)