Skip to content

Commit 888a20c

Browse files
authored
AMDGPU: Drop amdgpu-no-lds-kernel-id attribute in LDS lowering (#71481)
This is in preparation for moving the run of AMDGPUAttributor earlier. Currently it infers the lack of the corresponding intrinsic calls, so if we introduce new ones we need to remove the attribute from any possible transitive callers. This is more conservative than necessary; we could try to identify specific subgraphs where LDS globals are not used. Other options include teaching the attributor to avoid adding it in cases where the lowering may choose the table, but this seems more complex. Alternatively, we could add a second run, which doesn't seem worth it. Depends on #71349.
1 parent 5cfe24e commit 888a20c

File tree

2 files changed

+262
-0
lines changed

2 files changed

+262
-0
lines changed

llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp

+52
Original file line numberDiff line numberDiff line change
@@ -1026,6 +1026,51 @@ class AMDGPULowerModuleLDS {
10261026
return N;
10271027
}
10281028

1029+
/// Strip "amdgpu-no-lds-kernel-id" from any functions where we may have
1030+
/// introduced its use. If AMDGPUAttributor ran prior to the pass, we inferred
1031+
/// the lack of llvm.amdgcn.lds.kernel.id calls.
1032+
void removeNoLdsKernelIdFromReachable(CallGraph &CG, Function *KernelRoot) {
1033+
KernelRoot->removeFnAttr("amdgpu-no-lds-kernel-id");
1034+
1035+
SmallVector<Function *> Tmp({CG[KernelRoot]->getFunction()});
1036+
if (!Tmp.back())
1037+
return;
1038+
1039+
SmallPtrSet<Function *, 8> Visited;
1040+
bool SeenUnknownCall = false;
1041+
1042+
do {
1043+
Function *F = Tmp.pop_back_val();
1044+
1045+
for (auto &N : *CG[F]) {
1046+
if (!N.second)
1047+
continue;
1048+
1049+
Function *Callee = N.second->getFunction();
1050+
if (!Callee) {
1051+
if (!SeenUnknownCall) {
1052+
SeenUnknownCall = true;
1053+
1054+
// If we see any indirect calls, assume nothing about potential
1055+
// targets.
1056+
// TODO: This could be refined to possible LDS global users.
1057+
for (auto &N : *CG.getExternalCallingNode()) {
1058+
Function *PotentialCallee = N.second->getFunction();
1059+
if (!isKernelLDS(PotentialCallee))
1060+
PotentialCallee->removeFnAttr("amdgpu-no-lds-kernel-id");
1061+
}
1062+
1063+
continue;
1064+
}
1065+
}
1066+
1067+
Callee->removeFnAttr("amdgpu-no-lds-kernel-id");
1068+
if (Visited.insert(Callee).second)
1069+
Tmp.push_back(Callee);
1070+
}
1071+
} while (!Tmp.empty());
1072+
}
1073+
10291074
DenseMap<Function *, GlobalVariable *> lowerDynamicLDSVariables(
10301075
Module &M, LDSUsesInfoTy &LDSUsesInfo,
10311076
DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS,
@@ -1175,6 +1220,13 @@ class AMDGPULowerModuleLDS {
11751220
M, TableLookupVariablesOrdered, OrderedKernels, KernelToReplacement);
11761221
replaceUsesInInstructionsWithTableLookup(M, TableLookupVariablesOrdered,
11771222
LookupTable);
1223+
1224+
// Strip amdgpu-no-lds-kernel-id from all functions reachable from the
1225+
// kernel. We may have inferred this wasn't used prior to the pass.
1226+
//
1227+
// TODO: We could filter out subgraphs that do not access LDS globals.
1228+
for (Function *F : KernelsThatAllocateTableLDS)
1229+
removeNoLdsKernelIdFromReachable(CG, F);
11781230
}
11791231

11801232
DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS =
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals --version 3
2+
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-attributor,amdgpu-lower-module-lds < %s --amdgpu-lower-module-lds-strategy=table | FileCheck -check-prefixes=CHECK,TABLE %s
3+
4+
; FIXME: Work around update_test_checks bug in constant expression handling by manually deleting part of the last global pattern
5+
6+
@function.lds = addrspace(3) global i16 poison
7+
@other.kernel.lds = addrspace(3) global i16 poison
8+
@recursive.kernel.lds = addrspace(3) global i16 poison
9+
10+
;.
11+
; CHECK: @[[LLVM_AMDGCN_KERNEL_K0_F0_LDS:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [[LLVM_AMDGCN_KERNEL_K0_F0_LDS_T:%.*]] poison, align 2, !absolute_symbol !0
12+
; CHECK: @[[LLVM_AMDGCN_KERNEL_K1_F0_LDS:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [[LLVM_AMDGCN_KERNEL_K1_F0_LDS_T:%.*]] poison, align 2, !absolute_symbol !0
13+
; CHECK: @[[LLVM_AMDGCN_KERNEL_KERNEL_LDS_LDS:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [[LLVM_AMDGCN_KERNEL_KERNEL_LDS_LDS_T:%.*]] poison, align 2, !absolute_symbol !0
14+
; CHECK: @[[LLVM_AMDGCN_KERNEL_KERNEL_LDS_RECURSION_LDS:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [[LLVM_AMDGCN_KERNEL_KERNEL_LDS_RECURSION_LDS_T:%.*]] poison, align 2, !absolute_symbol !0
15+
; CHECK: @[[LLVM_AMDGCN_LDS_OFFSET_TABLE:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(4) constant [3 x [2 x i32]]
16+
;.
17+
; Reads @function.lds, multiplies the value by 7 and stores it back. It is
; called directly from @indirectly_called. The expected output replaces both
; LDS accesses with a lookup through llvm.amdgcn.lds.kernel.id and
; @llvm.amdgcn.lds.offset.table, and uses attribute group ATTR0, which does not
; list "amdgpu-no-lds-kernel-id".
define internal void @lds_use_through_indirect() {
18+
; CHECK-LABEL: define internal void @lds_use_through_indirect(
19+
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
20+
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
21+
; CHECK-NEXT: [[FUNCTION_LDS2:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
22+
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[FUNCTION_LDS2]], align 4
23+
; CHECK-NEXT: [[FUNCTION_LDS3:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
24+
; CHECK-NEXT: [[LD:%.*]] = load i16, ptr addrspace(3) [[FUNCTION_LDS3]], align 2
25+
; CHECK-NEXT: [[MUL:%.*]] = mul i16 [[LD]], 7
26+
; CHECK-NEXT: [[FUNCTION_LDS:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
27+
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[FUNCTION_LDS]], align 4
28+
; CHECK-NEXT: [[FUNCTION_LDS1:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
29+
; CHECK-NEXT: store i16 [[MUL]], ptr addrspace(3) [[FUNCTION_LDS1]], align 2
30+
; CHECK-NEXT: ret void
31+
;
32+
%ld = load i16, ptr addrspace(3) @function.lds
33+
%mul = mul i16 %ld, 7
34+
store i16 %mul, ptr addrspace(3) @function.lds
35+
ret void
36+
}
37+
38+
; Takes its own address (volatile store of @indirectly_called to a null
; addrspace(1) pointer) and then calls @lds_use_through_indirect. It is also
; called directly by @calls_indirectly_called. Expected attribute group: ATTR0.
define internal void @indirectly_called() {
39+
; CHECK-LABEL: define internal void @indirectly_called(
40+
; CHECK-SAME: ) #[[ATTR0]] {
41+
; CHECK-NEXT: store volatile ptr @indirectly_called, ptr addrspace(1) null, align 8
42+
; CHECK-NEXT: call void @lds_use_through_indirect()
43+
; CHECK-NEXT: ret void
44+
;
45+
store volatile ptr @indirectly_called, ptr addrspace(1) null
46+
call void @lds_use_through_indirect()
47+
ret void
48+
}
49+
50+
; Wrapper whose only instruction besides the return is a direct call to
; @indirectly_called. It is called from kernel @k1_f0. Expected attribute
; group: ATTR0.
define internal void @calls_indirectly_called() {
51+
; CHECK-LABEL: define internal void @calls_indirectly_called(
52+
; CHECK-SAME: ) #[[ATTR0]] {
53+
; CHECK-NEXT: call void @indirectly_called()
54+
; CHECK-NEXT: ret void
55+
;
56+
call void @indirectly_called()
57+
ret void
58+
}
59+
60+
; TODO: Should still have "amdgpu-no-lds-kernel-id" attached
; Empty leaf that references no LDS global and makes no calls; it is called
; from @f0 and @f0_transitive. The currently expected group ATTR1 does not
; list "amdgpu-no-lds-kernel-id", which is what the TODO above refers to.
62+
define internal void @no_lds_global_use_leaf() {
63+
; CHECK-LABEL: define internal void @no_lds_global_use_leaf(
64+
; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
65+
; CHECK-NEXT: ret void
66+
;
67+
ret void
68+
}
68+
69+
; Should have "amdgpu-no-lds-kernel-id" stripped
; Reads @function.lds, multiplies the value by 4, stores it back and then
; calls @no_lds_global_use_leaf. The expected output performs the LDS accesses
; through llvm.amdgcn.lds.kernel.id and @llvm.amdgcn.lds.offset.table.
71+
define internal void @f0() {
72+
; CHECK-LABEL: define internal void @f0(
73+
; CHECK-SAME: ) #[[ATTR0]] {
74+
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
75+
; CHECK-NEXT: [[FUNCTION_LDS2:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
76+
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[FUNCTION_LDS2]], align 4
77+
; CHECK-NEXT: [[FUNCTION_LDS3:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
78+
; CHECK-NEXT: [[LD:%.*]] = load i16, ptr addrspace(3) [[FUNCTION_LDS3]], align 2
79+
; CHECK-NEXT: [[MUL:%.*]] = mul i16 [[LD]], 4
80+
; CHECK-NEXT: [[FUNCTION_LDS:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
81+
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[FUNCTION_LDS]], align 4
82+
; CHECK-NEXT: [[FUNCTION_LDS1:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
83+
; CHECK-NEXT: store i16 [[MUL]], ptr addrspace(3) [[FUNCTION_LDS1]], align 2
84+
; CHECK-NEXT: call void @no_lds_global_use_leaf()
85+
; CHECK-NEXT: ret void
86+
;
87+
%ld = load i16, ptr addrspace(3) @function.lds
88+
%mul = mul i16 %ld, 4
89+
store i16 %mul, ptr addrspace(3) @function.lds
90+
call void @no_lds_global_use_leaf()
91+
ret void
92+
}
92+
93+
; Should have "amdgpu-no-lds-kernel-id" stripped
; References no LDS global itself; it calls @f0 (which uses @function.lds) and
; @no_lds_global_use_leaf. Called from kernels @k0_f0 and @k1_f0.
95+
define internal void @f0_transitive() {
96+
; CHECK-LABEL: define internal void @f0_transitive(
97+
; CHECK-SAME: ) #[[ATTR0]] {
98+
; CHECK-NEXT: call void @f0()
99+
; CHECK-NEXT: call void @no_lds_global_use_leaf()
100+
; CHECK-NEXT: ret void
101+
;
102+
call void @f0()
103+
call void @no_lds_global_use_leaf()
104+
ret void
105+
}
105+
106+
; Kernel whose only call is the direct call to @f0_transitive. The expected
; output attaches !llvm.amdgcn.lds.kernel.id metadata and uses group ATTR2,
; which does not list "amdgpu-no-lds-kernel-id".
define amdgpu_kernel void @k0_f0() {
107+
; CHECK-LABEL: define amdgpu_kernel void @k0_f0(
108+
; CHECK-SAME: ) #[[ATTR2:[0-9]+]] !llvm.amdgcn.lds.kernel.id !1 {
109+
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k0_f0.lds) ]
110+
; CHECK-NEXT: call void @f0_transitive()
111+
; CHECK-NEXT: ret void
112+
;
113+
call void @f0_transitive()
114+
ret void
115+
}
116+
117+
; Kernel that calls @f0_transitive, makes an indirect call through a pointer
; obtained from a volatile load, and calls @calls_indirectly_called. The
; expected group ATTR3 carries only "amdgpu-lds-size",
; "amdgpu-waves-per-eu" and "uniform-work-group-size".
define amdgpu_kernel void @k1_f0() {
118+
; CHECK-LABEL: define amdgpu_kernel void @k1_f0(
119+
; CHECK-SAME: ) #[[ATTR3:[0-9]+]] !llvm.amdgcn.lds.kernel.id !2 {
120+
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k1_f0.lds) ], !alias.scope !3, !noalias !6
121+
; CHECK-NEXT: call void @f0_transitive()
122+
; CHECK-NEXT: [[FPTR:%.*]] = load volatile ptr, ptr addrspace(1) null, align 8
123+
; CHECK-NEXT: call void [[FPTR]]()
124+
; CHECK-NEXT: call void @calls_indirectly_called()
125+
; CHECK-NEXT: ret void
126+
;
127+
call void @f0_transitive()
128+
%fptr = load volatile ptr, ptr addrspace(1) null
129+
call void %fptr()
130+
call void @calls_indirectly_called()
131+
ret void
132+
}
133+
134+
; Should still have "amdgpu-no-lds-kernel-id" attached
; Kernel that accesses @other.kernel.lds directly and makes no calls. The
; expected output addresses @llvm.amdgcn.kernel.kernel_lds.lds directly (no
; table lookup), and ATTR4 is the only expected function attribute group that
; lists "amdgpu-no-lds-kernel-id".
135+
define amdgpu_kernel void @kernel_lds() {
136+
; CHECK-LABEL: define amdgpu_kernel void @kernel_lds(
137+
; CHECK-SAME: ) #[[ATTR4:[0-9]+]] {
138+
; CHECK-NEXT: [[LD:%.*]] = load i16, ptr addrspace(3) @llvm.amdgcn.kernel.kernel_lds.lds, align 2
139+
; CHECK-NEXT: [[MUL:%.*]] = mul i16 [[LD]], 42
140+
; CHECK-NEXT: store i16 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.kernel_lds.lds, align 2
141+
; CHECK-NEXT: ret void
142+
;
143+
%ld = load i16, ptr addrspace(3) @other.kernel.lds
144+
%mul = mul i16 %ld, 42
145+
store i16 %mul, ptr addrspace(3) @other.kernel.lds
146+
ret void
147+
}
148+
149+
; One half of a two-function cycle with @mutual_recursion_1. Loads
; @recursive.kernel.lds, passes the loaded value to @mutual_recursion_1 and
; returns that call's result plus 1. %mul is computed but never used.
; NOTE(review): this calls @mutual_recursion_1 as returning i16, while it is
; defined as returning void; confirm whether the mismatch is intentional.
define internal i16 @mutual_recursion_0(i16 %arg) {
150+
; CHECK-LABEL: define internal i16 @mutual_recursion_0(
151+
; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR0]] {
152+
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
153+
; CHECK-NEXT: [[RECURSIVE_KERNEL_LDS:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
154+
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[RECURSIVE_KERNEL_LDS]], align 4
155+
; CHECK-NEXT: [[RECURSIVE_KERNEL_LDS1:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
156+
; CHECK-NEXT: [[LD:%.*]] = load i16, ptr addrspace(3) [[RECURSIVE_KERNEL_LDS1]], align 2
157+
; CHECK-NEXT: [[MUL:%.*]] = mul i16 [[LD]], 7
158+
; CHECK-NEXT: [[RET:%.*]] = call i16 @mutual_recursion_1(i16 [[LD]])
159+
; CHECK-NEXT: [[ADD:%.*]] = add i16 [[RET]], 1
160+
; CHECK-NEXT: ret i16 [[ADD]]
161+
;
162+
%ld = load i16, ptr addrspace(3) @recursive.kernel.lds
163+
%mul = mul i16 %ld, 7
164+
%ret = call i16 @mutual_recursion_1(i16 %ld)
165+
%add = add i16 %ret, 1
166+
ret i16 %add
167+
}
168+
169+
; Other half of the cycle: forwards %arg to @mutual_recursion_0 and returns.
; It references no LDS global itself. Expected attribute group: ATTR0.
; NOTE(review): this calls @mutual_recursion_0 as returning void, while it is
; defined as returning i16; confirm whether the mismatch is intentional.
define internal void @mutual_recursion_1(i16 %arg) {
170+
; CHECK-LABEL: define internal void @mutual_recursion_1(
171+
; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR0]] {
172+
; CHECK-NEXT: call void @mutual_recursion_0(i16 [[ARG]])
173+
; CHECK-NEXT: ret void
174+
;
175+
call void @mutual_recursion_0(i16 %arg)
176+
ret void
177+
}
178+
179+
; Kernel that enters the @mutual_recursion_0 / @mutual_recursion_1 cycle with
; a constant argument of 0. The expected output attaches
; !llvm.amdgcn.lds.kernel.id metadata and uses group ATTR2, which does not
; list "amdgpu-no-lds-kernel-id".
define amdgpu_kernel void @kernel_lds_recursion() {
180+
; CHECK-LABEL: define amdgpu_kernel void @kernel_lds_recursion(
181+
; CHECK-SAME: ) #[[ATTR2]] !llvm.amdgcn.lds.kernel.id !8 {
182+
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kernel_lds_recursion.lds) ]
183+
; CHECK-NEXT: call void @mutual_recursion_0(i16 0)
184+
; CHECK-NEXT: ret void
185+
;
186+
call void @mutual_recursion_0(i16 0)
187+
ret void
188+
}
189+
190+
;.
191+
; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
192+
; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
193+
; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
194+
; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
195+
; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
196+
; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
197+
; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
198+
;.
199+
; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 1}
200+
; CHECK: [[META1:![0-9]+]] = !{i32 0}
201+
; CHECK: [[META2:![0-9]+]] = !{i32 1}
202+
; CHECK: [[META3:![0-9]+]] = !{!4}
203+
; CHECK: [[META4:![0-9]+]] = distinct !{!4, !5}
204+
; CHECK: [[META5:![0-9]+]] = distinct !{!5}
205+
; CHECK: [[META6:![0-9]+]] = !{!7}
206+
; CHECK: [[META7:![0-9]+]] = distinct !{!7, !5}
207+
; CHECK: [[META8:![0-9]+]] = !{i32 2}
208+
;.
209+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
210+
; TABLE: {{.*}}

0 commit comments

Comments
 (0)