Skip to content

Commit 992b451

Browse files
authored
[Utils][UnifyLoopExits] Avoid costly updates if nothing changed (#129179)
If the ControlFlowHub did not make any change to the control flow, there is no need to restore SSA, update the loop structure, or run the various verification steps. This is not completely NFC (no functional change), though: restoreSSA previously introduced PHI nodes with a single entry, and those are no longer created. My code went from 400+ seconds to 1 second, because no loop required its exits to be unified but there were many "complex" loops.
1 parent 926600a commit 992b451

File tree

6 files changed

+33
-23
lines changed

6 files changed

+33
-23
lines changed

llvm/include/llvm/Transforms/Utils/ControlFlowUtils.h

+3-1
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,9 @@ struct ControlFlowHub {
110110
Branches.emplace_back(BB, Succ0, Succ1);
111111
}
112112

113-
BasicBlock *
113+
/// Return the unified loop exit block and a flag indicating if the CFG was
114+
/// changed at all.
115+
std::pair<BasicBlock *, bool>
114116
finalize(DomTreeUpdater *DTU, SmallVectorImpl<BasicBlock *> &GuardBlocks,
115117
const StringRef Prefix,
116118
std::optional<unsigned> MaxControlFlowBooleans = std::nullopt);

llvm/lib/Transforms/Utils/ControlFlowUtils.cpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -270,7 +270,7 @@ static void reconnectPhis(BasicBlock *Out, BasicBlock *GuardBlock,
270270
}
271271
}
272272

273-
BasicBlock *ControlFlowHub::finalize(
273+
std::pair<BasicBlock *, bool> ControlFlowHub::finalize(
274274
DomTreeUpdater *DTU, SmallVectorImpl<BasicBlock *> &GuardBlocks,
275275
const StringRef Prefix, std::optional<unsigned> MaxControlFlowBooleans) {
276276
#ifndef NDEBUG
@@ -289,7 +289,7 @@ BasicBlock *ControlFlowHub::finalize(
289289
}
290290

291291
if (Outgoing.size() < 2)
292-
return Outgoing.front();
292+
return {Outgoing.front(), false};
293293

294294
SmallVector<DominatorTree::UpdateType, 16> Updates;
295295
if (DTU) {
@@ -338,5 +338,5 @@ BasicBlock *ControlFlowHub::finalize(
338338
Inst->eraseFromParent();
339339
}
340340

341-
return FirstGuardBlock;
341+
return {FirstGuardBlock, true};
342342
}

llvm/lib/Transforms/Utils/UnifyLoopExits.cpp

+5-1
Original file line numberDiff line numberDiff line change
@@ -169,8 +169,12 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
169169

170170
SmallVector<BasicBlock *, 8> GuardBlocks;
171171
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
172-
BasicBlock *LoopExitBlock = CHub.finalize(
172+
BasicBlock *LoopExitBlock;
173+
bool ChangedCFG;
174+
std::tie(LoopExitBlock, ChangedCFG) = CHub.finalize(
173175
&DTU, GuardBlocks, "loop.exit", MaxBooleansInControlFlowHub.getValue());
176+
if (!ChangedCFG)
177+
return false;
174178

175179
restoreSSA(DT, L, ExitingBlocks, LoopExitBlock);
176180

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll

+4-2
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,7 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
298298
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
299299
; GFX10-NEXT: s_mov_b32 s5, 0
300300
; GFX10-NEXT: ; implicit-def: $sgpr6
301-
; GFX10-NEXT: v_mov_b32_e32 v4, s5
301+
; GFX10-NEXT: v_mov_b32_e32 v5, s5
302302
; GFX10-NEXT: s_branch .LBB4_2
303303
; GFX10-NEXT: .LBB4_1: ; %Flow
304304
; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
@@ -312,6 +312,7 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
312312
; GFX10-NEXT: s_cbranch_execz .LBB4_6
313313
; GFX10-NEXT: .LBB4_2: ; %cond.block.0
314314
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
315+
; GFX10-NEXT: v_mov_b32_e32 v4, v5
315316
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
316317
; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo
317318
; GFX10-NEXT: s_cbranch_execz .LBB4_4
@@ -328,11 +329,12 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
328329
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7
329330
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, v1, v4
330331
; GFX10-NEXT: s_mov_b32 s7, -1
332+
; GFX10-NEXT: ; implicit-def: $vgpr5
331333
; GFX10-NEXT: s_and_saveexec_b32 s8, s4
332334
; GFX10-NEXT: s_cbranch_execz .LBB4_1
333335
; GFX10-NEXT: ; %bb.5: ; %loop.cond
334336
; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
335-
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v4
337+
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v4
336338
; GFX10-NEXT: s_andn2_b32 s4, -1, exec_lo
337339
; GFX10-NEXT: s_and_b32 s7, exec_lo, 0
338340
; GFX10-NEXT: s_or_b32 s7, s4, s7

llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll

+16-16
Original file line numberDiff line numberDiff line change
@@ -7646,9 +7646,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
76467646
; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
76477647
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
76487648
; GFX7-NEXT: s_cbranch_execnz .LBB28_2
7649-
; GFX7-NEXT: ; %bb.3: ; %Flow23
7649+
; GFX7-NEXT: ; %bb.3: ; %Flow22
76507650
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
7651-
; GFX7-NEXT: .LBB28_4: ; %Flow24
7651+
; GFX7-NEXT: .LBB28_4: ; %Flow23
76527652
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
76537653
; GFX7-NEXT: s_mov_b64 s[8:9], exec
76547654
; GFX7-NEXT: v_readfirstlane_b32 s10, v1
@@ -7676,7 +7676,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
76767676
; GFX7-NEXT: v_mov_b32_e32 v3, v4
76777677
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
76787678
; GFX7-NEXT: s_cbranch_execnz .LBB28_6
7679-
; GFX7-NEXT: .LBB28_7: ; %Flow22
7679+
; GFX7-NEXT: .LBB28_7: ; %Flow21
76807680
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
76817681
; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
76827682
; GFX7-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
@@ -7725,7 +7725,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
77257725
; GFX7-NEXT: s_cbranch_execnz .LBB28_11
77267726
; GFX7-NEXT: ; %bb.12: ; %Flow
77277727
; GFX7-NEXT: s_or_b64 exec, exec, s[2:3]
7728-
; GFX7-NEXT: .LBB28_13: ; %Flow20
7728+
; GFX7-NEXT: .LBB28_13: ; %Flow19
77297729
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
77307730
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
77317731
; GFX7-NEXT: v_readfirstlane_b32 s4, v2
@@ -7770,9 +7770,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
77707770
; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
77717771
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
77727772
; GFX6-NEXT: s_cbranch_execnz .LBB28_2
7773-
; GFX6-NEXT: ; %bb.3: ; %Flow21
7773+
; GFX6-NEXT: ; %bb.3: ; %Flow20
77747774
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
7775-
; GFX6-NEXT: .LBB28_4: ; %Flow22
7775+
; GFX6-NEXT: .LBB28_4: ; %Flow21
77767776
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
77777777
; GFX6-NEXT: s_mov_b64 s[8:9], exec
77787778
; GFX6-NEXT: v_readfirstlane_b32 s10, v1
@@ -7800,7 +7800,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
78007800
; GFX6-NEXT: v_mov_b32_e32 v3, v4
78017801
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
78027802
; GFX6-NEXT: s_cbranch_execnz .LBB28_6
7803-
; GFX6-NEXT: .LBB28_7: ; %Flow20
7803+
; GFX6-NEXT: .LBB28_7: ; %Flow19
78047804
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
78057805
; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
78067806
; GFX6-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
@@ -7849,7 +7849,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
78497849
; GFX6-NEXT: s_cbranch_execnz .LBB28_11
78507850
; GFX6-NEXT: ; %bb.12: ; %Flow
78517851
; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
7852-
; GFX6-NEXT: .LBB28_13: ; %Flow18
7852+
; GFX6-NEXT: .LBB28_13: ; %Flow17
78537853
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
78547854
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
78557855
; GFX6-NEXT: v_readfirstlane_b32 s4, v2
@@ -8483,9 +8483,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
84838483
; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
84848484
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
84858485
; GFX7-NEXT: s_cbranch_execnz .LBB29_2
8486-
; GFX7-NEXT: ; %bb.3: ; %Flow23
8486+
; GFX7-NEXT: ; %bb.3: ; %Flow22
84878487
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
8488-
; GFX7-NEXT: .LBB29_4: ; %Flow24
8488+
; GFX7-NEXT: .LBB29_4: ; %Flow23
84898489
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
84908490
; GFX7-NEXT: s_mov_b64 s[8:9], exec
84918491
; GFX7-NEXT: v_readfirstlane_b32 s10, v1
@@ -8513,7 +8513,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
85138513
; GFX7-NEXT: v_mov_b32_e32 v3, v4
85148514
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
85158515
; GFX7-NEXT: s_cbranch_execnz .LBB29_6
8516-
; GFX7-NEXT: .LBB29_7: ; %Flow22
8516+
; GFX7-NEXT: .LBB29_7: ; %Flow21
85178517
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
85188518
; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
85198519
; GFX7-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
@@ -8562,7 +8562,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
85628562
; GFX7-NEXT: s_cbranch_execnz .LBB29_11
85638563
; GFX7-NEXT: ; %bb.12: ; %Flow
85648564
; GFX7-NEXT: s_or_b64 exec, exec, s[2:3]
8565-
; GFX7-NEXT: .LBB29_13: ; %Flow20
8565+
; GFX7-NEXT: .LBB29_13: ; %Flow19
85668566
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
85678567
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
85688568
; GFX7-NEXT: v_readfirstlane_b32 s4, v2
@@ -8607,9 +8607,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
86078607
; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
86088608
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
86098609
; GFX6-NEXT: s_cbranch_execnz .LBB29_2
8610-
; GFX6-NEXT: ; %bb.3: ; %Flow21
8610+
; GFX6-NEXT: ; %bb.3: ; %Flow20
86118611
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
8612-
; GFX6-NEXT: .LBB29_4: ; %Flow22
8612+
; GFX6-NEXT: .LBB29_4: ; %Flow21
86138613
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
86148614
; GFX6-NEXT: s_mov_b64 s[8:9], exec
86158615
; GFX6-NEXT: v_readfirstlane_b32 s10, v1
@@ -8637,7 +8637,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
86378637
; GFX6-NEXT: v_mov_b32_e32 v3, v4
86388638
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
86398639
; GFX6-NEXT: s_cbranch_execnz .LBB29_6
8640-
; GFX6-NEXT: .LBB29_7: ; %Flow20
8640+
; GFX6-NEXT: .LBB29_7: ; %Flow19
86418641
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
86428642
; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
86438643
; GFX6-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
@@ -8686,7 +8686,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
86868686
; GFX6-NEXT: s_cbranch_execnz .LBB29_11
86878687
; GFX6-NEXT: ; %bb.12: ; %Flow
86888688
; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
8689-
; GFX6-NEXT: .LBB29_13: ; %Flow18
8689+
; GFX6-NEXT: .LBB29_13: ; %Flow17
86908690
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
86918691
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
86928692
; GFX6-NEXT: v_readfirstlane_b32 s4, v2

llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll

+2
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
3939
; GFX942-NEXT: s_cbranch_vccz .LBB0_1
4040
; GFX942-NEXT: ; %bb.3:
4141
; GFX942-NEXT: ; implicit-def: $sgpr3
42+
; GFX942-NEXT: ; implicit-def: $agpr0
4243
; GFX942-NEXT: .LBB0_4: ; %common.ret
4344
; GFX942-NEXT: s_endpgm
4445
;
@@ -79,6 +80,7 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
7980
; GFX908-NEXT: s_cbranch_vccz .LBB0_1
8081
; GFX908-NEXT: ; %bb.3:
8182
; GFX908-NEXT: ; implicit-def: $sgpr3
83+
; GFX908-NEXT: ; implicit-def: $agpr0
8284
; GFX908-NEXT: .LBB0_4: ; %common.ret
8385
; GFX908-NEXT: s_endpgm
8486
entry:

0 commit comments

Comments
 (0)