Skip to content

Commit 078fc68

Browse files
admitric
authored and igcbot committed
Enable loop sinking of loads
Enable loop sinking of loads when it is beneficial for register pressure reduction, and rework the CodeSinking pass. CodeSinking pass rework: make loop sinking multi-pass to ensure all the beneficial instructions are sunk; use Uniform and AA information; add options to force loop sinking for perf tuning; disable multi-level loop sinking (across 2 loop bounds); add naive load scheduling in local sink; make the heuristic that decides whether an instruction is beneficial to sink aware of the data size; change the code style to LLVM in most parts of the changed functions.
1 parent cb113bb commit 078fc68

File tree

9 files changed

+926
-234
lines changed

9 files changed

+926
-234
lines changed

IGC/Compiler/CISACodeGen/CodeSinking.cpp

Lines changed: 488 additions & 222 deletions
Large diffs are not rendered by default.

IGC/Compiler/CISACodeGen/CodeSinking.hpp

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,19 +14,21 @@ See LICENSE.TXT for details.
1414
============================= end_copyright_notice ===========================*/
1515

1616
#pragma once
17+
#include "Compiler/CISACodeGen/WIAnalysis.hpp"
1718
#include "common/LLVMWarningsPush.hpp"
1819
#include <llvm/Analysis/PostDominators.h>
1920
#include <llvm/Analysis/LoopInfo.h>
2021
#include "common/LLVMWarningsPop.hpp"
2122

2223
namespace IGC {
2324

24-
#define CODE_SINKING_MIN_SIZE 32
25-
2625
class CodeSinking : public llvm::FunctionPass {
2726
llvm::DominatorTree* DT;
2827
llvm::PostDominatorTree* PDT;
2928
llvm::LoopInfo* LI;
29+
llvm::AliasAnalysis* AA;
30+
WIAnalysis* WI;
31+
3032
const llvm::DataLayout* DL; // to estimate register pressure
3133
CodeGenContext* CTX;
3234
public:
@@ -38,13 +40,19 @@ namespace IGC {
3840

3941
// Declares which analyses this pass needs and which ones it keeps valid.
// Required: dominator tree, post-dominator tree, loop info, alias analysis
// results, WIAnalysis and the CodeGenContext wrapper.
// Preserved: the CFG, dominator tree, post-dominator tree, loop info,
// alias analysis results and WIAnalysis.
virtual void getAnalysisUsage(llvm::AnalysisUsage& AU) const override {
4042
// The pass declares that it does not change the control-flow graph.
AU.setPreservesCFG();
43+
4144
// Analyses that must be available before this pass runs.
AU.addRequired<llvm::DominatorTreeWrapperPass>();
4245
AU.addRequired<llvm::PostDominatorTreeWrapperPass>();
4346
AU.addRequired<llvm::LoopInfoWrapperPass>();
47+
AU.addRequired<llvm::AAResultsWrapperPass>();
48+
AU.addRequired<WIAnalysis>();
4449
AU.addRequired<CodeGenContextWrapper>();
50+
4551
// Analyses reported as still valid after this pass.
AU.addPreserved<llvm::DominatorTreeWrapperPass>();
4652
AU.addPreserved<llvm::PostDominatorTreeWrapperPass>();
4753
AU.addPreserved<llvm::LoopInfoWrapperPass>();
54+
AU.addPreserved<llvm::AAResultsWrapperPass>();
55+
// WIAnalysis is preserved by pass ID rather than by type.
AU.addPreservedID(WIAnalysis::ID);
4856
}
4957
private:
5058
bool ProcessBlock(llvm::BasicBlock& blk);
@@ -61,12 +69,14 @@ namespace IGC {
6169
bool isSafeToMove(llvm::Instruction* inst,
6270
bool& reducePressure, bool& hasAliasConcern,
6371
llvm::SmallPtrSetImpl<llvm::Instruction*>& Stores);
72+
bool isSafeToLoopSinkLoad(llvm::Instruction* I, llvm::Loop* Loop, llvm::AliasAnalysis* AA);
73+
bool isAlwaysSinkInstruction(llvm::Instruction* I);
6474

6575
/// local processing
6676
bool LocalSink(llvm::BasicBlock* blk);
6777
/// data members for local-sinking
68-
llvm::SmallPtrSet<llvm::BasicBlock*, 8> localBlkSet;
69-
llvm::SmallPtrSet<llvm::Instruction*, 8> localInstSet;
78+
llvm::SmallPtrSet<llvm::BasicBlock*, 8> LocalBlkSet;
79+
llvm::SmallPtrSet<llvm::Instruction*, 8> LocalInstSet;
7080
/// data members for undo
7181
std::vector<llvm::Instruction*> movedInsts;
7282
std::vector<llvm::Instruction*> undoLocas;
@@ -88,6 +98,12 @@ namespace IGC {
8898
typedef std::pair<llvm::Instruction*, llvm::Instruction*> InstPair;
8999
typedef smallvector<llvm::Instruction*, 4> InstVec;
90100

101+
// memoize all possible stores for every loop that is a candidate for sinking
102+
typedef llvm::SmallVector<llvm::Instruction*, 32> StoresVec;
103+
llvm::DenseMap<llvm::Loop*, StoresVec> MemoizedStoresInLoops;
104+
llvm::SmallPtrSet<llvm::Loop*, 8> BlacklistedLoops;
105+
const StoresVec getAllStoresInLoop(llvm::Loop* L);
106+
91107
void appendIfNotExist(InstPair src, std::vector<InstPair> &instMap)
92108
{
93109
if (std::find(instMap.begin(), instMap.end(), src) == instMap.end())
@@ -123,12 +139,14 @@ namespace IGC {
123139
bool hoistCongruentPhi(llvm::Function& F);
124140

125141
llvm::Loop* findLoopAsPreheader(llvm::BasicBlock& blk);
126-
// move LI back into loops
127-
bool loopSink(llvm::Loop* LoopWithPressure, bool SinkMultipleLevel);
142+
// move LI back into loop
143+
bool loopSink(llvm::Loop* LoopWithPressure);
128144
// pre-condition to sink an instruction into a loop
129-
bool canLoopSink(llvm::Instruction* I, llvm::Loop* L);
130-
bool LoopSinkInstructions(
131-
llvm::SmallVector<llvm::Instruction*, 64> sinkCandidates, llvm::Loop* L);
145+
bool isLoopSinkCandidate(llvm::Instruction* I, llvm::Loop* L);
146+
bool loopSinkInstructions(
147+
llvm::SmallVector<llvm::Instruction*, 64>& SinkCandidates,
148+
llvm::SmallPtrSet<llvm::Instruction*, 32>& LoadChains,
149+
llvm::Loop* L);
132150

133151
// Move referencing DbgValueInst intrinsics calls after defining instructions
134152
void ProcessDbgValueInst(llvm::BasicBlock& blk);
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2023 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
9+
; REQUIRES: regkeys
10+
; RUN: igc_opt --regkey LoopSinkMinSave=1 --regkey LoopSinkMinSaveUniform=3 --regkey ForceLoopSink=1 --regkey CodeSinkingMinSize=10 %enable-basic-aa% --igc-wi-analysis --igc-code-sinking -S %s | FileCheck %s
11+
; We set LoopSinkMinSaveUniform=3 and check that, when all the values are uniform, only the group of instructions whose sinking saves 3 scalars is sunk
12+
; Kernel under test: entry_preheader holds two loads and seven fadds
; (four on %l_1, three on %l_2); all seven fadd results are used only in %loop.
define spir_kernel void @foo(float addrspace(1)* %in0, float addrspace(1)* %in1, float addrspace(1)* %out0, i32 %count, i16 %localIdX, i16 %localIdY, i16 %localIdZ) #0 {
13+
; CHECK-LABEL: @foo(
14+
; CHECK: entry:
15+
; CHECK: [[ADDR_1:%.*]] = getelementptr float, float addrspace(1)* [[IN0:%.*]], i32 0
16+
; CHECK: [[ADDR_2:%.*]] = getelementptr float, float addrspace(1)* [[IN1:%.*]], i32 0
17+
; CHECK: br label [[ENTRY_PREHEADER:%.*]]
18+
; CHECK: entry_preheader:
19+
; CHECK: [[L_1:%.*]] = load float, float addrspace(1)* [[ADDR_1]], align 16
20+
; CHECK: [[L_2:%.*]] = load float, float addrspace(1)* [[ADDR_2]], align 16
21+
; not sunk: the three fadds on L_2 are expected to stay in the preheader
22+
; CHECK: [[ADDF2_1:%.*]] = fadd float [[L_2]], 1.000000e+00
23+
; CHECK: [[ADDF2_2:%.*]] = fadd float [[L_2]], 2.000000e+00
24+
; CHECK: [[ADDF2_3:%.*]] = fadd float [[L_2]], 3.000000e+00
25+
; CHECK: loop:
26+
; sunk: the four fadds on L_1 are expected inside the loop
27+
; CHECK: [[ADDF1_1:%.*]] = fadd float [[L_1]], 1.000000e+00
28+
; CHECK: [[ADDF1_2:%.*]] = fadd float [[L_1]], 2.000000e+00
29+
; CHECK: [[ADDF1_3:%.*]] = fadd float [[L_1]], 3.000000e+00
30+
; CHECK: [[ADDF1_4:%.*]] = fadd float [[L_1]], 4.000000e+00
31+
; CHECK: afterloop:
32+
; CHECK: ret void
33+
;
34+
; Both addresses are computed with a constant index of 0.
entry:
35+
%addr_1 = getelementptr float, float addrspace(1)* %in0, i32 0
36+
%addr_2 = getelementptr float, float addrspace(1)* %in1, i32 0
37+
br label %entry_preheader
38+
39+
entry_preheader: ; preds = %entry
40+
%l_1 = load float, float addrspace(1)* %addr_1, align 16
41+
%l_2 = load float, float addrspace(1)* %addr_2, align 16
42+
%addf1_1 = fadd float %l_1, 1.0
43+
%addf1_2 = fadd float %l_1, 2.0
44+
%addf1_3 = fadd float %l_1, 3.0
45+
%addf1_4 = fadd float %l_1, 4.0
46+
%addf2_1 = fadd float %l_2, 1.0
47+
%addf2_2 = fadd float %l_2, 2.0
48+
%addf2_3 = fadd float %l_2, 3.0
49+
br label %loop
50+
51+
; The loop sums all seven preheader fadd results into %res and stores it to %out0 at %index.
loop: ; preds = %loop, %entry_preheader
52+
%index = phi i32 [ 0, %entry_preheader ], [ %inc, %loop ]
53+
%addff_1 = fadd float %addf1_1, %addf2_1
54+
%addff_2 = fadd float %addff_1, %addf1_2
55+
%addff_3 = fadd float %addff_2, %addf1_3
56+
%addff_4 = fadd float %addff_3, %addf1_4
57+
%addff_5 = fadd float %addff_4, %addf2_2
58+
%res = fadd float %addff_5, %addf2_3
59+
60+
%out0_shifted = getelementptr float, float addrspace(1)* %out0, i32 %index
61+
store float %res, float addrspace(1)* %out0_shifted, align 8
62+
%cmptmp = icmp ult i32 %index, %count
63+
%inc = add i32 %index, 1
64+
br i1 %cmptmp, label %loop, label %afterloop
65+
66+
afterloop: ; preds = %loop
67+
ret void
68+
}
69+
70+
!IGCMetadata = !{!2}
71+
!igc.functions = !{!13}
72+
73+
!2 = !{!"ModuleMD", !3}
74+
!3 = !{!"FuncMD", !4, !5}
75+
!4 = !{!"FuncMDMap[0]", void (float addrspace(1)*, float addrspace(1)*, float addrspace(1)*, i32, i16, i16, i16)* @foo}
76+
!5 = !{!"FuncMDValue[0]", !6, !7, !11, !12}
77+
!6 = !{!"localOffsets"}
78+
!7 = !{!"workGroupWalkOrder", !8, !9, !10}
79+
!8 = !{!"dim0", i32 0}
80+
!9 = !{!"dim1", i32 1}
81+
!10 = !{!"dim2", i32 2}
82+
!11 = !{!"funcArgs"}
83+
!12 = !{!"functionType", !"KernelFunction"}
84+
!13 = !{void (float addrspace(1)*, float addrspace(1)*, float addrspace(1)*, i32, i16, i16, i16)* @foo, !14}
85+
!14 = !{!15, !16}
86+
!15 = !{!"function_type", i32 0}
87+
!16 = !{!"implicit_arg_desc", !17, !18, !19, !20, !21}
88+
!17 = !{i32 0}
89+
!18 = !{i32 1}
90+
!19 = !{i32 7}
91+
!20 = !{i32 8}
92+
!21 = !{i32 9}
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2023 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
9+
; REQUIRES: regkeys
10+
; RUN: igc_opt --regkey LoopSinkMinSave=1 --regkey LoopSinkMinSaveUniform=10 --regkey ForceLoopSink=1 --regkey CodeSinkingMinSize=10 %enable-basic-aa% --igc-wi-analysis --igc-code-sinking -S %s | FileCheck %s
11+
; We set LoopSinkMinSaveUniform=10, but in this test the difference between uniform and non-uniform values still makes sinking beneficial
12+
; Kernel under test: %l_1 is loaded through an address indexed by %localIdX,
; %l_2 through an address with constant index 0. %addf_1 and %addff_1 are
; defined in entry_preheader and used in %loop.
define spir_kernel void @foo(float addrspace(1)* %in0, float addrspace(1)* %in1, float addrspace(1)* %out0, i32 %count, i16 %localIdX, i16 %localIdY, i16 %localIdZ) #0 {
13+
; CHECK-LABEL: @foo(
14+
; CHECK-NEXT: entry:
15+
; CHECK-NEXT: [[LOCALIDX32:%.*]] = zext i16 [[LOCALIDX:%.*]] to i32
16+
; CHECK-NEXT: [[NON_UNIFORM_ADDR_1:%.*]] = getelementptr float, float addrspace(1)* [[IN0:%.*]], i32 [[LOCALIDX32]]
17+
; CHECK-NEXT: [[UNIFORM_ADDR_2:%.*]] = getelementptr float, float addrspace(1)* [[IN1:%.*]], i32 0
18+
19+
; CHECK: entry_preheader:
20+
; CHECK-NEXT: [[NON_UNIFORM_LOAD_1:%.*]] = load float, float addrspace(1)* [[NON_UNIFORM_ADDR_1]], align 16
21+
; CHECK-NEXT: [[UNIFORM_LOAD_2:%.*]] = load float, float addrspace(1)* [[UNIFORM_ADDR_2]], align 16
22+
23+
; CHECK: br label [[LOOP:%.*]]
24+
25+
; CHECK: loop:
26+
27+
; this ADDF_1 should be sunk: it becomes free as l_1 is already used in the loop, so we decrease the pressure
28+
; CHECK: [[ADDF_1:%.*]] = fadd float [[NON_UNIFORM_LOAD_1]], 1.000000e+00
29+
30+
; this ADDFF_1 should be sunk:
31+
; it's float,float->float, but the only operand that is not used in the loop is uniform
32+
; and the fadd is not, so we remove register pressure by sinking it
33+
34+
; It wouldn't be sunk if we didn't prove it's uniform
35+
; CHECK: [[ADDFF_1:%.*]] = fadd float [[ADDF_1]], [[UNIFORM_LOAD_2]]
36+
37+
; CHECK: afterloop:
38+
;
39+
entry:
40+
%localIdX32 = zext i16 %localIdX to i32
41+
%addr_1 = getelementptr float, float addrspace(1)* %in0, i32 %localIdX32
42+
%addr_2 = getelementptr float, float addrspace(1)* %in1, i32 0
43+
br label %entry_preheader
44+
45+
entry_preheader: ; preds = %entry
46+
%l_1 = load float, float addrspace(1)* %addr_1, align 16
47+
%l_2 = load float, float addrspace(1)* %addr_2, align 16
48+
%addf_1 = fadd float %l_1, 1.0
49+
%addff_1 = fadd float %addf_1, %l_2
50+
br label %loop
51+
52+
; %l_1 is used directly in the loop (by %addf_2); %l_2 is used only through %addff_1.
loop: ; preds = %loop, %entry_preheader
53+
%index = phi i32 [ 0, %entry_preheader ], [ %inc, %loop ]
54+
%addf_2 = fadd float %l_1, 2.0
55+
%acc0 = fadd float %addf_1, %addf_2
56+
%acc1 = fadd float %addf_2, %addff_1
57+
58+
%out0_shifted = getelementptr float, float addrspace(1)* %out0, i32 %index
59+
store float %acc1, float addrspace(1)* %out0_shifted, align 8
60+
%cmptmp = icmp ult i32 %index, %count
61+
%inc = add i32 %index, 1
62+
br i1 %cmptmp, label %loop, label %afterloop
63+
64+
afterloop: ; preds = %loop
65+
ret void
66+
}
67+
68+
!IGCMetadata = !{!2}
69+
!igc.functions = !{!13}
70+
71+
!2 = !{!"ModuleMD", !3}
72+
!3 = !{!"FuncMD", !4, !5}
73+
!4 = !{!"FuncMDMap[0]", void (float addrspace(1)*, float addrspace(1)*, float addrspace(1)*, i32, i16, i16, i16)* @foo}
74+
!5 = !{!"FuncMDValue[0]", !6, !7, !11, !12}
75+
!6 = !{!"localOffsets"}
76+
!7 = !{!"workGroupWalkOrder", !8, !9, !10}
77+
!8 = !{!"dim0", i32 0}
78+
!9 = !{!"dim1", i32 1}
79+
!10 = !{!"dim2", i32 2}
80+
!11 = !{!"funcArgs"}
81+
!12 = !{!"functionType", !"KernelFunction"}
82+
!13 = !{void (float addrspace(1)*, float addrspace(1)*, float addrspace(1)*, i32, i16, i16, i16)* @foo, !14}
83+
!14 = !{!15, !16}
84+
!15 = !{!"function_type", i32 0}
85+
!16 = !{!"implicit_arg_desc", !17, !18, !19, !20, !21}
86+
!17 = !{i32 0}
87+
!18 = !{i32 1}
88+
!19 = !{i32 7}
89+
!20 = !{i32 8}
90+
!21 = !{i32 9}
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2023 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
9+
; REQUIRES: regkeys
10+
; RUN: igc_opt --regkey LoopSinkMinSave=4 --regkey ForceLoopSink=1 --regkey CodeSinkingMinSize=10 %enable-basic-aa% --igc-code-sinking -S %s | FileCheck %s
11+
; Function under test: both loads are in %entry; five fadds on %l_1 and one
; fadd on %l_3 are in %entry_preheader, and their results are used only in %loop.
define void @foo(float addrspace(1)* %in0, double addrspace(1)* %in1, float addrspace(1)* noalias %out0, i32 %count, i32 %offsetIn0, i32 %offsetIn2) {
12+
; CHECK-LABEL: @foo(
13+
; CHECK: entry:
14+
; CHECK: [[ADDR_1:%.*]] = getelementptr float, float addrspace(1)* [[IN0:%.*]], i32 0
15+
; CHECK: [[ADDR_3:%.*]] = getelementptr float, float addrspace(1)* [[IN0]], i32 2
16+
; CHECK: br label [[ENTRY_PREHEADER:%.*]]
17+
; CHECK: entry_preheader:
18+
; CHECK: [[L_1:%.*]] = load float, float addrspace(1)* [[ADDR_1]], align 16
19+
; CHECK: [[L_3:%.*]] = load float, float addrspace(1)* [[ADDR_3]], align 16
20+
21+
; this add is not beneficial to sink, so it is expected to stay before the loop
22+
23+
; CHECK: [[ADDFF_1:%.*]] = fadd float [[L_3]], 1.000000e+00
24+
; CHECK: br label [[LOOP:%.*]]
25+
; CHECK: loop:
26+
27+
; These 5 adds are beneficial to sink at once, because then only one value (L_1) is alive in the loop instead of 5
28+
29+
; CHECK: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY_PREHEADER]] ], [ [[INC:%.*]], [[LOOP]] ]
30+
; CHECK: [[ADDF_2:%.*]] = fadd float [[L_1]], 2.000000e+00
31+
; CHECK: [[ADDF_1:%.*]] = fadd float [[L_1]], 1.000000e+00
32+
; CHECK: [[ACC0:%.*]] = fadd float [[ADDF_1]], [[ADDF_2]]
33+
; CHECK: [[ADDF_3:%.*]] = fadd float [[L_1]], 3.000000e+00
34+
; CHECK: [[ACC1:%.*]] = fadd float [[ACC0]], [[ADDF_3]]
35+
; CHECK: [[ADDF_4:%.*]] = fadd float [[L_1]], 4.000000e+00
36+
; CHECK: [[ACC2:%.*]] = fadd float [[ACC1]], [[ADDF_4]]
37+
; CHECK: [[ADDF_5:%.*]] = fadd float [[L_1]], 5.000000e+00
38+
; CHECK: [[ACC3:%.*]] = fadd float [[ACC2]], [[ADDF_5]]
39+
; CHECK: [[ACC4:%.*]] = fadd float [[ACC2]], [[ADDFF_1]]
40+
; CHECK: [[OUT0_SHIFTED:%.*]] = getelementptr float, float addrspace(1)* [[OUT0:%.*]], i32 [[INDEX]]
41+
; CHECK: store float [[ACC4]], float addrspace(1)* [[OUT0_SHIFTED]], align 8
42+
; CHECK: [[INC]] = add i32 [[INDEX]], 1
43+
; CHECK: [[CMPTMP:%.*]] = icmp ult i32 [[INDEX]], [[COUNT:%.*]]
44+
; CHECK: br i1 [[CMPTMP]], label [[LOOP]], label [[AFTERLOOP:%.*]]
45+
; CHECK: afterloop:
46+
; CHECK: ret void
47+
;
48+
entry:
49+
%addr_1 = getelementptr float, float addrspace(1)* %in0, i32 0
50+
%addr_3 = getelementptr float, float addrspace(1)* %in0, i32 2
51+
52+
%l_1 = load float, float addrspace(1)* %addr_1, align 16
53+
%l_3 = load float, float addrspace(1)* %addr_3, align 16
54+
55+
br label %entry_preheader
56+
57+
entry_preheader: ; preds = %entry
58+
%addf_1 = fadd float %l_1, 1.0
59+
%addf_2 = fadd float %l_1, 2.0
60+
%addf_3 = fadd float %l_1, 3.0
61+
%addf_4 = fadd float %l_1, 4.0
62+
%addf_5 = fadd float %l_1, 5.0
63+
64+
%addff_1 = fadd float %l_3, 1.0
65+
66+
br label %loop
67+
68+
loop: ; preds = %loop, %entry_preheader
69+
%index = phi i32 [ 0, %entry_preheader ], [ %inc, %loop ]
70+
%acc0 = fadd float %addf_1, %addf_2
71+
%acc1 = fadd float %acc0, %addf_3
72+
%acc2 = fadd float %acc1, %addf_4
; NOTE: %acc3 has no uses in this function; %acc4 below is computed from %acc2.
73+
%acc3 = fadd float %acc2, %addf_5
74+
75+
%acc4 = fadd float %acc2, %addff_1
76+
77+
%out0_shifted = getelementptr float, float addrspace(1)* %out0, i32 %index
78+
store float %acc4, float addrspace(1)* %out0_shifted, align 8
79+
%cmptmp = icmp ult i32 %index, %count
80+
%inc = add i32 %index, 1
81+
br i1 %cmptmp, label %loop, label %afterloop
82+
83+
afterloop: ; preds = %loop
84+
ret void
85+
}
86+
87+
!igc.functions = !{}

0 commit comments

Comments
 (0)