Skip to content

Commit e8e45f5

Browse files
htyutstellar
authored and committed
[CSSPGO] Unblock optimizations with pseudo probe instrumentation.
The IR/MIR pseudo probe intrinsics don't get materialized into real machine instructions and therefore they don't incur runtime cost directly. However, they come with indirect cost by blocking certain optimizations. Some of the blocking is intentional (such as blocking code merge) for better counts quality, while the rest is accidental. This change unblocks perf-critical optimizations that do not affect counts quality. They include: 1. IR InstCombine, sinking load operations to shorten lifetimes. 2. MIR LiveRangeShrink, similar to #1. 3. MIR TwoAddressInstructionPass, i.e., the opeq transform. 4. MIR function argument copy elision. 5. IR stack protection (not perf-critical, but nice to have). Reviewed By: wmi Differential Revision: https://reviews.llvm.org/D95982
1 parent 1071279 commit e8e45f5

15 files changed

+209
-13
lines changed

llvm/include/llvm/CodeGen/MachineInstr.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1156,6 +1156,10 @@ class MachineInstr
11561156
return getOpcode() == TargetOpcode::CFI_INSTRUCTION;
11571157
}
11581158

1159+
bool isPseudoProbe() const {
1160+
return getOpcode() == TargetOpcode::PSEUDO_PROBE;
1161+
}
1162+
11591163
// True if the instruction represents a position in the function.
11601164
bool isPosition() const { return isLabel() || isCFIInstruction(); }
11611165

@@ -1165,6 +1169,9 @@ class MachineInstr
11651169
bool isDebugInstr() const {
11661170
return isDebugValue() || isDebugLabel() || isDebugRef();
11671171
}
1172+
bool isDebugOrPseudoInstr() const {
1173+
return isDebugInstr() || isPseudoProbe();
1174+
}
11681175

11691176
bool isDebugOffsetImm() const { return getDebugOffset().isImm(); }
11701177

llvm/include/llvm/IR/Instruction.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -654,6 +654,9 @@ class Instruction : public User,
654654
/// llvm.lifetime.end marker.
655655
bool isLifetimeStartOrEnd() const;
656656

657+
/// Return true if the instruction is a DbgInfoIntrinsic or PseudoProbeInst.
658+
bool isDebugOrPseudoInst() const;
659+
657660
/// Return a pointer to the next non-debug instruction in the same basic
658661
/// block as 'this', or nullptr if no such instruction exists. Skip any pseudo
659662
/// operations if \c SkipPseudoOp is true.

llvm/lib/CodeGen/LiveRangeShrink.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,8 @@ bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
156156
// If MI has side effects, it should become a barrier for code motion.
157157
// IOM is rebuilt from the next instruction to prevent later
158158
// instructions from being moved before this MI.
159-
if (MI.hasUnmodeledSideEffects() && Next != MBB.end()) {
159+
if (MI.hasUnmodeledSideEffects() && !MI.isPseudoProbe() &&
160+
Next != MBB.end()) {
160161
BuildInstOrderMap(Next, IOM);
161162
SawStore = false;
162163
}

llvm/lib/CodeGen/MachineInstr.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1462,7 +1462,8 @@ bool MachineInstr::hasUnmodeledSideEffects() const {
14621462
}
14631463

14641464
bool MachineInstr::isLoadFoldBarrier() const {
1465-
return mayStore() || isCall() || hasUnmodeledSideEffects();
1465+
return mayStore() || isCall() ||
1466+
(hasUnmodeledSideEffects() && !isPseudoProbe());
14661467
}
14671468

14681469
/// allDefsAreDead - Return true if all the defs of this instruction are dead.

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9660,8 +9660,9 @@ findArgumentCopyElisionCandidates(const DataLayout &DL,
96609660
// We will look through cast uses, so ignore them completely.
96619661
if (I.isCast())
96629662
continue;
9663-
// Ignore debug info intrinsics, they don't escape or store to allocas.
9664-
if (isa<DbgInfoIntrinsic>(I))
9663+
// Ignore debug info and pseudo op intrinsics, they don't escape or store
9664+
// to allocas.
9665+
if (I.isDebugOrPseudoInst())
96659666
continue;
96669667
// This is an unknown instruction. Assume it escapes or writes to all
96679668
// static alloca operands.

llvm/lib/CodeGen/StackProtector.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ bool StackProtector::HasAddressTaken(const Instruction *AI,
192192
// Ignore intrinsics that do not become real instructions.
193193
// TODO: Narrow this to intrinsics that have store-like effects.
194194
const auto *CI = cast<CallInst>(I);
195-
if (!isa<DbgInfoIntrinsic>(CI) && !CI->isLifetimeStartOrEnd())
195+
if (!CI->isDebugOrPseudoInst() && !CI->isLifetimeStartOrEnd())
196196
return true;
197197
break;
198198
}

llvm/lib/CodeGen/TwoAddressInstructionPass.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -801,8 +801,8 @@ bool TwoAddressInstructionPass::rescheduleMIBelowKill(
801801
MachineBasicBlock::iterator KillPos = KillMI;
802802
++KillPos;
803803
for (MachineInstr &OtherMI : make_range(End, KillPos)) {
804-
// Debug instructions cannot be counted against the limit.
805-
if (OtherMI.isDebugInstr())
804+
// Debug or pseudo instructions cannot be counted against the limit.
805+
if (OtherMI.isDebugOrPseudoInstr())
806806
continue;
807807
if (NumVisited > 10) // FIXME: Arbitrary limit to reduce compile time cost.
808808
return false;
@@ -974,8 +974,8 @@ bool TwoAddressInstructionPass::rescheduleKillAboveMI(
974974
unsigned NumVisited = 0;
975975
for (MachineInstr &OtherMI :
976976
make_range(mi, MachineBasicBlock::iterator(KillMI))) {
977-
// Debug instructions cannot be counted against the limit.
978-
if (OtherMI.isDebugInstr())
977+
// Debug or pseudo instructions cannot be counted against the limit.
978+
if (OtherMI.isDebugOrPseudoInstr())
979979
continue;
980980
if (NumVisited > 10) // FIXME: Arbitrary limit to reduce compile time cost.
981981
return false;

llvm/lib/IR/Instruction.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -651,6 +651,10 @@ bool Instruction::isLifetimeStartOrEnd() const {
651651
return ID == Intrinsic::lifetime_start || ID == Intrinsic::lifetime_end;
652652
}
653653

654+
bool Instruction::isDebugOrPseudoInst() const {
655+
return isa<DbgInfoIntrinsic>(this) || isa<PseudoProbeInst>(this);
656+
}
657+
654658
const Instruction *
655659
Instruction::getNextNonDebugInstruction(bool SkipPseudoOp) const {
656660
for (const Instruction *I = getNextNode(); I; I = I->getNextNode())

llvm/lib/Transforms/IPO/FunctionAttrs.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,13 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
149149
if (isNoModRef(MRI))
150150
continue;
151151

152+
// A pseudo probe call shouldn't change any function attribute since it
153+
// doesn't translate to a real instruction. It comes with a memory access
154+
// tag that keeps it from being removed by optimizations without blocking
155+
// other instructions from being optimized.
156+
if (isa<PseudoProbeInst>(I))
157+
continue;
158+
152159
if (!AliasAnalysis::onlyAccessesArgPointees(MRB)) {
153160
// The call could access any memory. If that includes writes, note it.
154161
if (isModSet(MRI))

llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -592,8 +592,14 @@ static bool isSafeAndProfitableToSinkLoad(LoadInst *L) {
592592
BasicBlock::iterator BBI = L->getIterator(), E = L->getParent()->end();
593593

594594
for (++BBI; BBI != E; ++BBI)
595-
if (BBI->mayWriteToMemory())
595+
if (BBI->mayWriteToMemory()) {
596+
// Calls that only access inaccessible memory do not block sinking the
597+
// load.
598+
if (auto *CB = dyn_cast<CallBase>(BBI))
599+
if (CB->onlyAccessesInaccessibleMemory())
600+
continue;
596601
return false;
602+
}
597603

598604
// Check for non-address taken alloca. If not address-taken already, it isn't
599605
// profitable to do this xform.

llvm/lib/Transforms/InstCombine/InstructionCombining.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3878,9 +3878,10 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
38783878
}
38793879
}
38803880

3881-
// Skip processing debug intrinsics in InstCombine. Processing these call instructions
3882-
// consumes non-trivial amount of time and provides no value for the optimization.
3883-
if (!isa<DbgInfoIntrinsic>(Inst)) {
3881+
// Skip processing debug and pseudo intrinsics in InstCombine. Processing
3882+
// these call instructions consumes non-trivial amount of time and
3883+
// provides no value for the optimization.
3884+
if (!Inst->isDebugOrPseudoInst()) {
38843885
InstrsForInstCombineWorklist.push_back(Inst);
38853886
SeenAliasScopes.analyse(Inst);
38863887
}
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
; RUN: opt -passes=instcombine -S < %s | FileCheck %s
2+
3+
%struct.nonbonded = type { [2 x %struct.CompAtom*], [2 x %struct.CompAtomExt*], [2 x %struct.CompAtom*], [2 x %class.Vector*], [2 x %class.Vector*], [2 x i32], %class.Vector, double*, double*, %class.ComputeNonbondedWorkArrays*, %class.Pairlists*, i32, i32, double, double, i32, i32, i32, i32 }
4+
%struct.CompAtomExt = type { i32 }
5+
%struct.CompAtom = type { %class.Vector, float, i16, i8, i8 }
6+
%class.Vector = type { double, double, double }
7+
%class.ComputeNonbondedWorkArrays = type { %class.ResizeArray, %class.ResizeArray.0, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray.2, %class.ResizeArray.2 }
8+
%class.ResizeArray.0 = type { i32 (...)**, %class.ResizeArrayRaw.1* }
9+
%class.ResizeArrayRaw.1 = type <{ double*, i8*, i32, i32, i32, float, i32, [4 x i8] }>
10+
%class.ResizeArray = type { i32 (...)**, %class.ResizeArrayRaw* }
11+
%class.ResizeArrayRaw = type <{ i16*, i8*, i32, i32, i32, float, i32, [4 x i8] }>
12+
%class.ResizeArray.2 = type { i32 (...)**, %class.ResizeArrayRaw.3* }
13+
%class.ResizeArrayRaw.3 = type <{ %class.Vector*, i8*, i32, i32, i32, float, i32, [4 x i8] }>
14+
%class.Pairlists = type { i16*, i32, i32 }
15+
16+
;; Check the minPart4 and minPart assignments are merged.
17+
; CHECK-COUNT-1: getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 16
18+
; CHECK-NOT: getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 16
19+
20+
define dso_local void @_ZN20ComputeNonbondedUtil9calc_pairEP9nonbonded(%struct.nonbonded* nocapture readonly %params) local_unnamed_addr align 2 {
21+
entry:
22+
%savePairlists3 = getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 11
23+
%0 = load i32, i32* %savePairlists3, align 8
24+
%usePairlists4 = getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 12
25+
%1 = load i32, i32* %usePairlists4, align 4
26+
%tobool54.not = icmp eq i32 %0, 0
27+
br i1 %tobool54.not, label %lor.lhs.false55, label %if.end109
28+
29+
lor.lhs.false55: ; preds = %entry
30+
%tobool56.not = icmp eq i32 %1, 0
31+
br i1 %tobool56.not, label %if.end109, label %if.end109.thread
32+
33+
if.end109.thread: ; preds = %lor.lhs.false55
34+
%minPart4 = getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 16
35+
%2 = load i32, i32* %minPart4, align 4
36+
call void @llvm.pseudoprobe(i64 -6172701105289426098, i64 2, i32 0, i64 -1)
37+
br label %if.then138
38+
39+
if.end109: ; preds = %lor.lhs.false55, %entry
40+
%minPart = getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 16
41+
%3 = load i32, i32* %minPart, align 4
42+
call void @llvm.pseudoprobe(i64 -6172701105289426098, i64 3, i32 0, i64 -1)
43+
%tobool116.not = icmp eq i32 %1, 0
44+
br i1 %tobool116.not, label %if.then117, label %if.then138
45+
46+
if.then117: ; preds = %if.end109
47+
ret void
48+
49+
if.then138: ; preds = %if.end109.thread, %if.end109
50+
%4 = phi i32 [ %2, %if.end109.thread ], [ %3, %if.end109 ]
51+
%tobool139.not = icmp eq i32 %4, 0
52+
br i1 %tobool139.not, label %if.else147, label %if.then140
53+
54+
if.then140: ; preds = %if.then138
55+
ret void
56+
57+
if.else147: ; preds = %if.then138
58+
ret void
59+
}
60+
61+
declare dso_local void @_ZN9Pairlists8addIndexEv() align 2
62+
63+
; Function Attrs: inaccessiblememonly nounwind willreturn
64+
declare void @llvm.pseudoprobe(i64, i64, i32, i64) #0
65+
66+
attributes #0 = { inaccessiblememonly nounwind willreturn }
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
; PR1075
2+
; RUN: llc < %s -mcpu=generic -mtriple=x86_64-apple-darwin -pseudo-probe-for-profiling -O3 | FileCheck %s
3+
4+
define float @foo(float %x) #0 {
5+
%tmp1 = fmul float %x, 3.000000e+00
6+
%tmp3 = fmul float %x, 5.000000e+00
7+
%tmp5 = fmul float %x, 7.000000e+00
8+
%tmp7 = fmul float %x, 1.100000e+01
9+
call void @llvm.pseudoprobe(i64 6699318081062747564, i64 1, i32 0, i64 -1)
10+
%tmp10 = fadd float %tmp1, %tmp3
11+
%tmp12 = fadd float %tmp10, %tmp5
12+
%tmp14 = fadd float %tmp12, %tmp7
13+
ret float %tmp14
14+
; CHECK: mulss
15+
; CHECK: mulss
16+
; CHECK: addss
17+
; CHECK: mulss
18+
; CHECK: addss
19+
; CHECK: mulss
20+
; CHECK: addss
21+
; CHECK: ret
22+
}
23+
24+
; Function Attrs: inaccessiblememonly nounwind willreturn
25+
declare void @llvm.pseudoprobe(i64, i64, i32, i64) #1
26+
27+
attributes #0 = { nounwind }
28+
attributes #1 = { inaccessiblememonly nounwind willreturn }
29+
30+
!llvm.pseudo_probe_desc = !{!0}
31+
32+
!0 = !{i64 6699318081062747564, i64 4294967295, !"foo", null}
33+
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
; RUN: llc -mtriple=x86_64-- -stop-after=peephole-opt -o - %s | FileCheck %s
2+
3+
define internal i32 @arc_compare() {
4+
entry:
5+
%0 = load i64, i64* undef, align 8
6+
br i1 undef, label %return, label %if.end
7+
8+
if.end: ; preds = %entry
9+
; Check that a register copy has been sunk into the compare instruction.
10+
; CHECK: %[[#REG:]]:gr64 = IMPLICIT_DEF
11+
; CHECK-NOT: %[[#]]:gr64 = MOV64rm %[[#REG]]
12+
; CHECK: PSEUDO_PROBE 5116412291814990879, 3, 0, 0
13+
; CHECK: CMP64mr %[[#REG]], 1
14+
call void @llvm.pseudoprobe(i64 5116412291814990879, i64 3, i32 0, i64 -1)
15+
%cmp4 = icmp slt i64 %0, undef
16+
br i1 %cmp4, label %return, label %if.end6
17+
18+
if.end6: ; preds = %if.end
19+
call void @llvm.pseudoprobe(i64 5116412291814990879, i64 5, i32 0, i64 -1)
20+
br label %return
21+
22+
return: ; preds = %if.end6, %if.end, %entry
23+
ret i32 undef
24+
}
25+
26+
; Function Attrs: inaccessiblememonly nounwind willreturn
27+
declare void @llvm.pseudoprobe(i64, i64, i32, i64) #0
28+
29+
attributes #0 = { inaccessiblememonly nounwind willreturn }
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
; RUN: llc -stop-after=twoaddressinstruction -mtriple=x86_64-- -o - %s | FileCheck %s
2+
3+
4+
define dso_local double @twoaddressinstruction() local_unnamed_addr {
5+
for.end:
6+
%0 = load i64, i64* undef, align 8
7+
br label %for.body14.preheader
8+
9+
for.body14.preheader: ; preds = %for.end
10+
br i1 undef, label %for.cond25.preheader.loopexit.unr-lcssa, label %for.body14.preheader.new
11+
12+
for.body14.preheader.new: ; preds = %for.body14.preheader
13+
%unroll_iter136 = and i64 %0, -4
14+
br label %for.body14
15+
16+
for.cond25.preheader.loopexit.unr-lcssa: ; preds = %for.body14, %for.body14.preheader
17+
%indvars.iv127.unr = phi i64 [ 1, %for.body14.preheader ], [ %indvars.iv.next128.3, %for.body14 ]
18+
ret double undef
19+
20+
for.body14: ; preds = %for.body14, %for.body14.preheader.new
21+
%indvars.iv127 = phi i64 [ 1, %for.body14.preheader.new ], [ %indvars.iv.next128.3, %for.body14 ]
22+
%niter137 = phi i64 [ %unroll_iter136, %for.body14.preheader.new ], [ %niter137.nsub.3, %for.body14 ]
23+
%indvars.iv.next128.3 = add nuw nsw i64 %indvars.iv127, 4
24+
; CHECK: PSEUDO_PROBE -6878943695821059507, 9, 0, 0
25+
call void @llvm.pseudoprobe(i64 -6878943695821059507, i64 9, i32 0, i64 -1)
26+
;; Check an opeq form of instruction is created.
27+
; CHECK: %[[#REG:]]:gr64_nosp = COPY killed %[[#]]
28+
; CHECK: %[[#REG]]:gr64_nosp = nuw ADD64ri8 %[[#REG]], 4, implicit-def dead $eflags
29+
%niter137.nsub.3 = add i64 %niter137, -4
30+
%niter137.ncmp.3 = icmp eq i64 %niter137.nsub.3, 0
31+
br i1 %niter137.ncmp.3, label %for.cond25.preheader.loopexit.unr-lcssa, label %for.body14
32+
}
33+
34+
; Function Attrs: inaccessiblememonly nounwind willreturn
35+
declare void @llvm.pseudoprobe(i64, i64, i32, i64) #0
36+
37+
attributes #0 = { inaccessiblememonly nounwind willreturn }

0 commit comments

Comments
 (0)