Skip to content

Commit cbdccb3

Browse files
committed
[RA] Split a virtual register in cold blocks if it is not assigned its preferred physical register
If a virtual register is not assigned its preferred physical register, some COPY instructions will be turned into real register move instructions. In this case we can try to split the virtual register in colder blocks; if that succeeds, the original COPY instructions can be deleted, and the new COPY instructions in the colder blocks will be generated as register move instructions. This results in fewer dynamic register move instructions being executed. The new test case split-reg-with-hint.ll gives an example: the hot path contains 24 instructions without this patch, and only 4 instructions with it. Differential Revision: https://reviews.llvm.org/D156491
1 parent 01e80a0 commit cbdccb3

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+899
-617
lines changed

llvm/lib/CodeGen/RegAllocGreedy.cpp

Lines changed: 142 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,12 @@ static cl::opt<bool> GreedyReverseLocalAssignment(
141141
"shorter local live ranges will tend to be allocated first"),
142142
cl::Hidden);
143143

144+
static cl::opt<unsigned> SplitThresholdForRegWithHint(
145+
"split-threshold-for-reg-with-hint",
146+
cl::desc("The threshold for splitting a virtual register with a hint, in "
147+
"percentate"),
148+
cl::init(75), cl::Hidden);
149+
144150
static RegisterRegAlloc greedyRegAlloc("greedy", "greedy register allocator",
145151
createGreedyRegisterAllocator);
146152

@@ -422,6 +428,11 @@ MCRegister RAGreedy::tryAssign(const LiveInterval &VirtReg,
422428
evictInterference(VirtReg, PhysHint, NewVRegs);
423429
return PhysHint;
424430
}
431+
432+
// We can also split the virtual register in cold blocks.
433+
if (trySplitAroundHintReg(PhysHint, VirtReg, NewVRegs, Order))
434+
return 0;
435+
425436
// Record the missed hint, we may be able to recover
426437
// at the end if the surrounding allocation changed.
427438
SetOfBrokenHints.insert(&VirtReg);
@@ -1064,86 +1075,98 @@ MCRegister RAGreedy::tryRegionSplit(const LiveInterval &VirtReg,
10641075
return doRegionSplit(VirtReg, BestCand, HasCompact, NewVRegs);
10651076
}
10661077

1067-
unsigned RAGreedy::calculateRegionSplitCost(const LiveInterval &VirtReg,
1078+
unsigned
1079+
RAGreedy::calculateRegionSplitCostAroundReg(MCPhysReg PhysReg,
10681080
AllocationOrder &Order,
10691081
BlockFrequency &BestCost,
10701082
unsigned &NumCands,
1071-
bool IgnoreCSR) {
1072-
unsigned BestCand = NoCand;
1073-
for (MCPhysReg PhysReg : Order) {
1074-
assert(PhysReg);
1075-
if (IgnoreCSR && EvictAdvisor->isUnusedCalleeSavedReg(PhysReg))
1076-
continue;
1077-
1078-
// Discard bad candidates before we run out of interference cache cursors.
1079-
// This will only affect register classes with a lot of registers (>32).
1080-
if (NumCands == IntfCache.getMaxCursors()) {
1081-
unsigned WorstCount = ~0u;
1082-
unsigned Worst = 0;
1083-
for (unsigned CandIndex = 0; CandIndex != NumCands; ++CandIndex) {
1084-
if (CandIndex == BestCand || !GlobalCand[CandIndex].PhysReg)
1085-
continue;
1086-
unsigned Count = GlobalCand[CandIndex].LiveBundles.count();
1087-
if (Count < WorstCount) {
1088-
Worst = CandIndex;
1089-
WorstCount = Count;
1090-
}
1083+
unsigned &BestCand) {
1084+
// Discard bad candidates before we run out of interference cache cursors.
1085+
// This will only affect register classes with a lot of registers (>32).
1086+
if (NumCands == IntfCache.getMaxCursors()) {
1087+
unsigned WorstCount = ~0u;
1088+
unsigned Worst = 0;
1089+
for (unsigned CandIndex = 0; CandIndex != NumCands; ++CandIndex) {
1090+
if (CandIndex == BestCand || !GlobalCand[CandIndex].PhysReg)
1091+
continue;
1092+
unsigned Count = GlobalCand[CandIndex].LiveBundles.count();
1093+
if (Count < WorstCount) {
1094+
Worst = CandIndex;
1095+
WorstCount = Count;
10911096
}
1092-
--NumCands;
1093-
GlobalCand[Worst] = GlobalCand[NumCands];
1094-
if (BestCand == NumCands)
1095-
BestCand = Worst;
10961097
}
1098+
--NumCands;
1099+
GlobalCand[Worst] = GlobalCand[NumCands];
1100+
if (BestCand == NumCands)
1101+
BestCand = Worst;
1102+
}
10971103

1098-
if (GlobalCand.size() <= NumCands)
1099-
GlobalCand.resize(NumCands+1);
1100-
GlobalSplitCandidate &Cand = GlobalCand[NumCands];
1101-
Cand.reset(IntfCache, PhysReg);
1104+
if (GlobalCand.size() <= NumCands)
1105+
GlobalCand.resize(NumCands+1);
1106+
GlobalSplitCandidate &Cand = GlobalCand[NumCands];
1107+
Cand.reset(IntfCache, PhysReg);
11021108

1103-
SpillPlacer->prepare(Cand.LiveBundles);
1104-
BlockFrequency Cost;
1105-
if (!addSplitConstraints(Cand.Intf, Cost)) {
1106-
LLVM_DEBUG(dbgs() << printReg(PhysReg, TRI) << "\tno positive bundles\n");
1107-
continue;
1108-
}
1109-
LLVM_DEBUG(dbgs() << printReg(PhysReg, TRI) << "\tstatic = ";
1110-
MBFI->printBlockFreq(dbgs(), Cost));
1111-
if (Cost >= BestCost) {
1112-
LLVM_DEBUG({
1113-
if (BestCand == NoCand)
1114-
dbgs() << " worse than no bundles\n";
1115-
else
1116-
dbgs() << " worse than "
1117-
<< printReg(GlobalCand[BestCand].PhysReg, TRI) << '\n';
1118-
});
1119-
continue;
1120-
}
1121-
if (!growRegion(Cand)) {
1122-
LLVM_DEBUG(dbgs() << ", cannot spill all interferences.\n");
1123-
continue;
1124-
}
1109+
SpillPlacer->prepare(Cand.LiveBundles);
1110+
BlockFrequency Cost;
1111+
if (!addSplitConstraints(Cand.Intf, Cost)) {
1112+
LLVM_DEBUG(dbgs() << printReg(PhysReg, TRI) << "\tno positive bundles\n");
1113+
return BestCand;
1114+
}
1115+
LLVM_DEBUG(dbgs() << printReg(PhysReg, TRI) << "\tstatic = ";
1116+
MBFI->printBlockFreq(dbgs(), Cost));
1117+
if (Cost >= BestCost) {
1118+
LLVM_DEBUG({
1119+
if (BestCand == NoCand)
1120+
dbgs() << " worse than no bundles\n";
1121+
else
1122+
dbgs() << " worse than "
1123+
<< printReg(GlobalCand[BestCand].PhysReg, TRI) << '\n';
1124+
});
1125+
return BestCand;
1126+
}
1127+
if (!growRegion(Cand)) {
1128+
LLVM_DEBUG(dbgs() << ", cannot spill all interferences.\n");
1129+
return BestCand;
1130+
}
1131+
1132+
SpillPlacer->finish();
1133+
1134+
// No live bundles, defer to splitSingleBlocks().
1135+
if (!Cand.LiveBundles.any()) {
1136+
LLVM_DEBUG(dbgs() << " no bundles.\n");
1137+
return BestCand;
1138+
}
1139+
1140+
Cost += calcGlobalSplitCost(Cand, Order);
1141+
LLVM_DEBUG({
1142+
dbgs() << ", total = ";
1143+
MBFI->printBlockFreq(dbgs(), Cost) << " with bundles";
1144+
for (int I : Cand.LiveBundles.set_bits())
1145+
dbgs() << " EB#" << I;
1146+
dbgs() << ".\n";
1147+
});
1148+
if (Cost < BestCost) {
1149+
BestCand = NumCands;
1150+
BestCost = Cost;
1151+
}
1152+
++NumCands;
11251153

1126-
SpillPlacer->finish();
1154+
return BestCand;
1155+
}
11271156

1128-
// No live bundles, defer to splitSingleBlocks().
1129-
if (!Cand.LiveBundles.any()) {
1130-
LLVM_DEBUG(dbgs() << " no bundles.\n");
1157+
unsigned RAGreedy::calculateRegionSplitCost(const LiveInterval &VirtReg,
1158+
AllocationOrder &Order,
1159+
BlockFrequency &BestCost,
1160+
unsigned &NumCands,
1161+
bool IgnoreCSR) {
1162+
unsigned BestCand = NoCand;
1163+
for (MCPhysReg PhysReg : Order) {
1164+
assert(PhysReg);
1165+
if (IgnoreCSR && EvictAdvisor->isUnusedCalleeSavedReg(PhysReg))
11311166
continue;
1132-
}
11331167

1134-
Cost += calcGlobalSplitCost(Cand, Order);
1135-
LLVM_DEBUG({
1136-
dbgs() << ", total = ";
1137-
MBFI->printBlockFreq(dbgs(), Cost) << " with bundles";
1138-
for (int I : Cand.LiveBundles.set_bits())
1139-
dbgs() << " EB#" << I;
1140-
dbgs() << ".\n";
1141-
});
1142-
if (Cost < BestCost) {
1143-
BestCand = NumCands;
1144-
BestCost = Cost;
1145-
}
1146-
++NumCands;
1168+
calculateRegionSplitCostAroundReg(PhysReg, Order, BestCost, NumCands,
1169+
BestCand);
11471170
}
11481171

11491172
return BestCand;
@@ -1189,6 +1212,53 @@ unsigned RAGreedy::doRegionSplit(const LiveInterval &VirtReg, unsigned BestCand,
11891212
return 0;
11901213
}
11911214

1215+
// VirtReg has a physical Hint, this function tries to split VirtReg around
1216+
// Hint if we can place new COPY instructions in cold blocks.
1217+
bool RAGreedy::trySplitAroundHintReg(MCPhysReg Hint,
1218+
const LiveInterval &VirtReg,
1219+
SmallVectorImpl<Register> &NewVRegs,
1220+
AllocationOrder &Order) {
1221+
BlockFrequency Cost = 0;
1222+
Register Reg = VirtReg.reg();
1223+
1224+
// Compute the cost of assigning a non Hint physical register to VirtReg.
1225+
// We define it as the total frequency of broken COPY instructions to/from
1226+
// Hint register, and after split, they can be deleted.
1227+
for (const MachineInstr &Instr : MRI->reg_nodbg_instructions(Reg)) {
1228+
if (!TII->isFullCopyInstr(Instr))
1229+
continue;
1230+
Register OtherReg = Instr.getOperand(1).getReg();
1231+
if (OtherReg == Reg) {
1232+
OtherReg = Instr.getOperand(0).getReg();
1233+
if (OtherReg == Reg)
1234+
continue;
1235+
// Check if VirtReg interferes with OtherReg after this COPY instruction.
1236+
if (VirtReg.liveAt(LIS->getInstructionIndex(Instr).getRegSlot()))
1237+
continue;
1238+
}
1239+
MCRegister OtherPhysReg =
1240+
OtherReg.isPhysical() ? OtherReg.asMCReg() : VRM->getPhys(OtherReg);
1241+
if (OtherPhysReg == Hint)
1242+
Cost += MBFI->getBlockFreq(Instr.getParent());
1243+
}
1244+
1245+
// Decrease the cost so it will be split in colder blocks.
1246+
BranchProbability Threshold(SplitThresholdForRegWithHint, 100);
1247+
Cost *= Threshold;
1248+
if (Cost == 0)
1249+
return false;
1250+
1251+
unsigned NumCands = 0;
1252+
unsigned BestCand = NoCand;
1253+
SA->analyze(&VirtReg);
1254+
calculateRegionSplitCostAroundReg(Hint, Order, Cost, NumCands, BestCand);
1255+
if (BestCand == NoCand)
1256+
return false;
1257+
1258+
doRegionSplit(VirtReg, BestCand, false/*HasCompact*/, NewVRegs);
1259+
return true;
1260+
}
1261+
11921262
//===----------------------------------------------------------------------===//
11931263
// Per-Block Splitting
11941264
//===----------------------------------------------------------------------===//
@@ -2329,6 +2399,9 @@ MCRegister RAGreedy::selectOrSplitImpl(const LiveInterval &VirtReg,
23292399
} else
23302400
return PhysReg;
23312401
}
2402+
// Non-empty NewVRegs means VirtReg has been split.
2403+
if (!NewVRegs.empty())
2404+
return 0;
23322405

23332406
LiveRangeStage Stage = ExtraInfo->getStage(VirtReg);
23342407
LLVM_DEBUG(dbgs() << StageName[Stage] << " Cascade "

llvm/lib/CodeGen/RegAllocGreedy.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,12 @@ class LLVM_LIBRARY_VISIBILITY RAGreedy : public MachineFunctionPass,
348348
const SmallVirtRegSet &);
349349
MCRegister tryRegionSplit(const LiveInterval &, AllocationOrder &,
350350
SmallVectorImpl<Register> &);
351+
/// Calculate cost of region splitting around the specified register.
352+
unsigned calculateRegionSplitCostAroundReg(MCPhysReg PhysReg,
353+
AllocationOrder &Order,
354+
BlockFrequency &BestCost,
355+
unsigned &NumCands,
356+
unsigned &BestCand);
351357
/// Calculate cost of region splitting.
352358
unsigned calculateRegionSplitCost(const LiveInterval &VirtReg,
353359
AllocationOrder &Order,
@@ -356,6 +362,10 @@ class LLVM_LIBRARY_VISIBILITY RAGreedy : public MachineFunctionPass,
356362
/// Perform region splitting.
357363
unsigned doRegionSplit(const LiveInterval &VirtReg, unsigned BestCand,
358364
bool HasCompact, SmallVectorImpl<Register> &NewVRegs);
365+
/// Try to split VirtReg around physical Hint register.
366+
bool trySplitAroundHintReg(MCPhysReg Hint, const LiveInterval &VirtReg,
367+
SmallVectorImpl<Register> &NewVRegs,
368+
AllocationOrder &Order);
359369
/// Check other options before using a callee-saved register for the first
360370
/// time.
361371
MCRegister tryAssignCSRFirstTime(const LiveInterval &VirtReg,

llvm/test/CodeGen/AArch64/cgp-usubo.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -161,13 +161,13 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) nounwin
161161
; CHECK-NEXT: tbz w3, #0, .LBB8_3
162162
; CHECK-NEXT: // %bb.1: // %t
163163
; CHECK-NEXT: cmp x0, x1
164-
; CHECK-NEXT: mov x23, x0
164+
; CHECK-NEXT: mov x22, x0
165165
; CHECK-NEXT: mov x20, x2
166166
; CHECK-NEXT: cset w21, lo
167-
; CHECK-NEXT: mov x22, x1
167+
; CHECK-NEXT: mov x23, x1
168168
; CHECK-NEXT: mov w0, w21
169169
; CHECK-NEXT: bl call
170-
; CHECK-NEXT: subs x8, x23, x22
170+
; CHECK-NEXT: subs x8, x22, x23
171171
; CHECK-NEXT: b.hs .LBB8_3
172172
; CHECK-NEXT: // %bb.2: // %end
173173
; CHECK-NEXT: mov w19, w21

llvm/test/CodeGen/AArch64/csr-split.ll

Lines changed: 13 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -83,19 +83,18 @@ declare signext i32 @callNonVoid(ptr) local_unnamed_addr
8383
define dso_local signext i32 @test2(ptr %p1) local_unnamed_addr uwtable {
8484
; CHECK-LABEL: test2:
8585
; CHECK: // %bb.0: // %entry
86-
; CHECK-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
87-
; CHECK-NEXT: .cfi_def_cfa_offset 16
88-
; CHECK-NEXT: .cfi_offset w19, -8
89-
; CHECK-NEXT: .cfi_offset w30, -16
90-
; CHECK-NEXT: .cfi_remember_state
9186
; CHECK-NEXT: cbz x0, .LBB1_3
9287
; CHECK-NEXT: // %bb.1: // %entry
9388
; CHECK-NEXT: adrp x8, a
94-
; CHECK-NEXT: mov x19, x0
9589
; CHECK-NEXT: ldrsw x8, [x8, :lo12:a]
9690
; CHECK-NEXT: cmp x8, x0
9791
; CHECK-NEXT: b.ne .LBB1_3
9892
; CHECK-NEXT: // %bb.2: // %if.then2
93+
; CHECK-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
94+
; CHECK-NEXT: .cfi_def_cfa_offset 16
95+
; CHECK-NEXT: .cfi_offset w19, -8
96+
; CHECK-NEXT: .cfi_offset w30, -16
97+
; CHECK-NEXT: mov x19, x0
9998
; CHECK-NEXT: bl callVoid
10099
; CHECK-NEXT: mov x0, x19
101100
; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload
@@ -104,34 +103,28 @@ define dso_local signext i32 @test2(ptr %p1) local_unnamed_addr uwtable {
104103
; CHECK-NEXT: .cfi_restore w30
105104
; CHECK-NEXT: b callNonVoid
106105
; CHECK-NEXT: .LBB1_3: // %return
107-
; CHECK-NEXT: .cfi_restore_state
108106
; CHECK-NEXT: mov w0, wzr
109-
; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload
110-
; CHECK-NEXT: .cfi_def_cfa_offset 0
111-
; CHECK-NEXT: .cfi_restore w19
112-
; CHECK-NEXT: .cfi_restore w30
113107
; CHECK-NEXT: ret
114108
;
115109
; CHECK-APPLE-LABEL: test2:
116110
; CHECK-APPLE: ; %bb.0: ; %entry
117-
; CHECK-APPLE-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
118-
; CHECK-APPLE-NEXT: .cfi_def_cfa_offset 32
119-
; CHECK-APPLE-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
120-
; CHECK-APPLE-NEXT: .cfi_offset w30, -8
121-
; CHECK-APPLE-NEXT: .cfi_offset w29, -16
122-
; CHECK-APPLE-NEXT: .cfi_offset w19, -24
123-
; CHECK-APPLE-NEXT: .cfi_offset w20, -32
124-
; CHECK-APPLE-NEXT: .cfi_remember_state
125111
; CHECK-APPLE-NEXT: cbz x0, LBB1_3
126112
; CHECK-APPLE-NEXT: ; %bb.1: ; %entry
127113
; CHECK-APPLE-NEXT: Lloh2:
128114
; CHECK-APPLE-NEXT: adrp x8, _a@PAGE
129-
; CHECK-APPLE-NEXT: mov x19, x0
130115
; CHECK-APPLE-NEXT: Lloh3:
131116
; CHECK-APPLE-NEXT: ldrsw x8, [x8, _a@PAGEOFF]
132117
; CHECK-APPLE-NEXT: cmp x8, x0
133118
; CHECK-APPLE-NEXT: b.ne LBB1_3
134119
; CHECK-APPLE-NEXT: ; %bb.2: ; %if.then2
120+
; CHECK-APPLE-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
121+
; CHECK-APPLE-NEXT: .cfi_def_cfa_offset 32
122+
; CHECK-APPLE-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
123+
; CHECK-APPLE-NEXT: .cfi_offset w30, -8
124+
; CHECK-APPLE-NEXT: .cfi_offset w29, -16
125+
; CHECK-APPLE-NEXT: .cfi_offset w19, -24
126+
; CHECK-APPLE-NEXT: .cfi_offset w20, -32
127+
; CHECK-APPLE-NEXT: mov x19, x0
135128
; CHECK-APPLE-NEXT: bl _callVoid
136129
; CHECK-APPLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
137130
; CHECK-APPLE-NEXT: mov x0, x19
@@ -143,15 +136,7 @@ define dso_local signext i32 @test2(ptr %p1) local_unnamed_addr uwtable {
143136
; CHECK-APPLE-NEXT: .cfi_restore w20
144137
; CHECK-APPLE-NEXT: b _callNonVoid
145138
; CHECK-APPLE-NEXT: LBB1_3: ; %return
146-
; CHECK-APPLE-NEXT: .cfi_restore_state
147-
; CHECK-APPLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
148139
; CHECK-APPLE-NEXT: mov w0, wzr
149-
; CHECK-APPLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
150-
; CHECK-APPLE-NEXT: .cfi_def_cfa_offset 0
151-
; CHECK-APPLE-NEXT: .cfi_restore w30
152-
; CHECK-APPLE-NEXT: .cfi_restore w29
153-
; CHECK-APPLE-NEXT: .cfi_restore w19
154-
; CHECK-APPLE-NEXT: .cfi_restore w20
155140
; CHECK-APPLE-NEXT: ret
156141
; CHECK-APPLE-NEXT: .loh AdrpLdr Lloh2, Lloh3
157142
entry:

0 commit comments

Comments
 (0)