Skip to content

Commit b9f96f9

Browse files
author
anikelal
committed
[Clang] Emit stub version of OpenCL Kernel
OpenCL allows a kernel function to call another kernel function. To facilitate this, we emit a stub version of each kernel function with a different name-mangling scheme and redirect kernel call sites to that stub. #60313 https://ontrack-internal.amd.com/browse/SWDEV-245936
1 parent d1e17a3 commit b9f96f9

13 files changed

+163
-21
lines changed

clang/include/clang/AST/GlobalDecl.h

+30-11
Original file line numberDiff line numberDiff line change
@@ -71,14 +71,19 @@ class GlobalDecl {
7171
GlobalDecl(const FunctionDecl *D, unsigned MVIndex = 0)
7272
: MultiVersionIndex(MVIndex) {
7373
if (!D->hasAttr<CUDAGlobalAttr>()) {
74+
if (D->hasAttr<OpenCLKernelAttr>()) {
75+
Value.setPointerAndInt(D, unsigned(KernelReferenceKind::Kernel));
76+
return;
77+
}
7478
Init(D);
7579
return;
7680
}
7781
Value.setPointerAndInt(D, unsigned(getDefaultKernelReference(D)));
7882
}
7983
GlobalDecl(const FunctionDecl *D, KernelReferenceKind Kind)
8084
: Value(D, unsigned(Kind)) {
81-
assert(D->hasAttr<CUDAGlobalAttr>() && "Decl is not a GPU kernel!");
85+
assert((D->hasAttr<CUDAGlobalAttr>() && "Decl is not a GPU kernel!") ||
86+
(D->hasAttr<OpenCLKernelAttr>() && "Decl is not a OpenCL kernel!"));
8287
}
8388
GlobalDecl(const NamedDecl *D) { Init(D); }
8489
GlobalDecl(const BlockDecl *D) { Init(D); }
@@ -130,13 +135,20 @@ class GlobalDecl {
130135
}
131136

132137
KernelReferenceKind getKernelReferenceKind() const {
133-
assert(((isa<FunctionDecl>(getDecl()) &&
134-
cast<FunctionDecl>(getDecl())->hasAttr<CUDAGlobalAttr>()) ||
135-
(isa<FunctionTemplateDecl>(getDecl()) &&
136-
cast<FunctionTemplateDecl>(getDecl())
137-
->getTemplatedDecl()
138-
->hasAttr<CUDAGlobalAttr>())) &&
139-
"Decl is not a GPU kernel!");
138+
assert((((isa<FunctionDecl>(getDecl()) &&
139+
cast<FunctionDecl>(getDecl())->hasAttr<CUDAGlobalAttr>()) ||
140+
(isa<FunctionTemplateDecl>(getDecl()) &&
141+
cast<FunctionTemplateDecl>(getDecl())
142+
->getTemplatedDecl()
143+
->hasAttr<CUDAGlobalAttr>())) &&
144+
"Decl is not a GPU kernel!") ||
145+
(((isa<FunctionDecl>(getDecl()) &&
146+
cast<FunctionDecl>(getDecl())->hasAttr<OpenCLKernelAttr>()) ||
147+
(isa<FunctionTemplateDecl>(getDecl()) &&
148+
cast<FunctionTemplateDecl>(getDecl())
149+
->getTemplatedDecl()
150+
->hasAttr<OpenCLKernelAttr>())) &&
151+
"Decl is not a OpenCL kernel!"));
140152
return static_cast<KernelReferenceKind>(Value.getInt());
141153
}
142154

@@ -196,9 +208,16 @@ class GlobalDecl {
196208
}
197209

198210
GlobalDecl getWithKernelReferenceKind(KernelReferenceKind Kind) {
199-
assert(isa<FunctionDecl>(getDecl()) &&
200-
cast<FunctionDecl>(getDecl())->hasAttr<CUDAGlobalAttr>() &&
201-
"Decl is not a GPU kernel!");
211+
assert((isa<FunctionDecl>(getDecl()) &&
212+
cast<FunctionDecl>(getDecl())->hasAttr<CUDAGlobalAttr>() &&
213+
"Decl is not a GPU kernel!") ||
214+
(((isa<FunctionDecl>(getDecl()) &&
215+
cast<FunctionDecl>(getDecl())->hasAttr<OpenCLKernelAttr>()) ||
216+
(isa<FunctionTemplateDecl>(getDecl()) &&
217+
cast<FunctionTemplateDecl>(getDecl())
218+
->getTemplatedDecl()
219+
->hasAttr<OpenCLKernelAttr>())) &&
220+
"Decl is not a OpenCL kernel!"));
202221
GlobalDecl Result(*this);
203222
Result.Value.setInt(unsigned(Kind));
204223
return Result;

clang/include/clang/AST/Type.h

+5
Original file line numberDiff line numberDiff line change
@@ -4668,6 +4668,11 @@ class FunctionType : public Type {
46684668
CallingConv getCallConv() const { return getExtInfo().getCC(); }
46694669
ExtInfo getExtInfo() const { return ExtInfo(FunctionTypeBits.ExtInfo); }
46704670

4671+
void setCC(unsigned cc) {
4672+
FunctionTypeBits.ExtInfo =
4673+
(FunctionTypeBits.ExtInfo & ~ExtInfo::CallConvMask) | cc;
4674+
}
4675+
46714676
static_assert((~Qualifiers::FastMask & Qualifiers::CVRMask) == 0,
46724677
"Const, volatile and restrict are assumed to be a subset of "
46734678
"the fast qualifiers.");

clang/lib/AST/Expr.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -694,7 +694,8 @@ std::string PredefinedExpr::ComputeName(PredefinedIdentKind IK,
694694
GD = GlobalDecl(CD, Ctor_Base);
695695
else if (const CXXDestructorDecl *DD = dyn_cast<CXXDestructorDecl>(ND))
696696
GD = GlobalDecl(DD, Dtor_Base);
697-
else if (ND->hasAttr<CUDAGlobalAttr>())
697+
else if (ND->hasAttr<CUDAGlobalAttr>() ||
698+
ND->hasAttr<OpenCLKernelAttr>())
698699
GD = GlobalDecl(cast<FunctionDecl>(ND));
699700
else
700701
GD = GlobalDecl(ND);

clang/lib/AST/ItaniumMangle.cpp

+14
Original file line numberDiff line numberDiff line change
@@ -528,6 +528,7 @@ class CXXNameMangler {
528528
void mangleSourceName(const IdentifierInfo *II);
529529
void mangleRegCallName(const IdentifierInfo *II);
530530
void mangleDeviceStubName(const IdentifierInfo *II);
531+
void mangleOCLDeviceStubName(const IdentifierInfo *II);
531532
void mangleSourceNameWithAbiTags(
532533
const NamedDecl *ND, const AbiTagList *AdditionalAbiTags = nullptr);
533534
void mangleLocalName(GlobalDecl GD,
@@ -1563,8 +1564,13 @@ void CXXNameMangler::mangleUnqualifiedName(
15631564
bool IsDeviceStub =
15641565
FD && FD->hasAttr<CUDAGlobalAttr>() &&
15651566
GD.getKernelReferenceKind() == KernelReferenceKind::Stub;
1567+
bool IsOCLDeviceStub =
1568+
FD && FD->hasAttr<OpenCLKernelAttr>() &&
1569+
GD.getKernelReferenceKind() == KernelReferenceKind::Stub;
15661570
if (IsDeviceStub)
15671571
mangleDeviceStubName(II);
1572+
else if (IsOCLDeviceStub)
1573+
mangleOCLDeviceStubName(II);
15681574
else if (IsRegCall)
15691575
mangleRegCallName(II);
15701576
else
@@ -1782,6 +1788,14 @@ void CXXNameMangler::mangleDeviceStubName(const IdentifierInfo *II) {
17821788
<< II->getName();
17831789
}
17841790

1791+
void CXXNameMangler::mangleOCLDeviceStubName(const IdentifierInfo *II) {
1792+
// <source-name> ::= <positive length number> __clang_ocl_kern_imp_ <identifier>
1793+
// <number> ::= [n] <non-negative decimal integer>
1794+
// <identifier> ::= <unqualified source code identifier>
1795+
Out << II->getLength() + sizeof("__clang_ocl_kern_imp_") - 1
1796+
<< "__clang_ocl_kern_imp_" << II->getName();
1797+
}
1798+
17851799
void CXXNameMangler::mangleSourceName(const IdentifierInfo *II) {
17861800
// <source-name> ::= <positive length number> <identifier>
17871801
// <number> ::= [n] <non-negative decimal integer>

clang/lib/AST/Mangle.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -540,7 +540,7 @@ class ASTNameGenerator::Implementation {
540540
GD = GlobalDecl(CtorD, Ctor_Complete);
541541
else if (const auto *DtorD = dyn_cast<CXXDestructorDecl>(D))
542542
GD = GlobalDecl(DtorD, Dtor_Complete);
543-
else if (D->hasAttr<CUDAGlobalAttr>())
543+
else if (D->hasAttr<CUDAGlobalAttr>() || D->hasAttr<OpenCLKernelAttr>())
544544
GD = GlobalDecl(cast<FunctionDecl>(D));
545545
else
546546
GD = GlobalDecl(D);

clang/lib/AST/MicrosoftMangle.cpp

+11
Original file line numberDiff line numberDiff line change
@@ -1163,9 +1163,20 @@ void MicrosoftCXXNameMangler::mangleUnqualifiedName(GlobalDecl GD,
11631163
->getTemplatedDecl()
11641164
->hasAttr<CUDAGlobalAttr>())) &&
11651165
GD.getKernelReferenceKind() == KernelReferenceKind::Stub;
1166+
bool IsOCLDeviceStub =
1167+
ND &&
1168+
((isa<FunctionDecl>(ND) && ND->hasAttr<OpenCLKernelAttr>()) ||
1169+
(isa<FunctionTemplateDecl>(ND) &&
1170+
cast<FunctionTemplateDecl>(ND)
1171+
->getTemplatedDecl()
1172+
->hasAttr<OpenCLKernelAttr>())) &&
1173+
GD.getKernelReferenceKind() == KernelReferenceKind::Stub;
11661174
if (IsDeviceStub)
11671175
mangleSourceName(
11681176
(llvm::Twine("__device_stub__") + II->getName()).str());
1177+
else if (IsOCLDeviceStub)
1178+
mangleSourceName(
1179+
(llvm::Twine("__clang_ocl_kern_imp_") + II->getName()).str());
11691180
else
11701181
mangleSourceName(II->getName());
11711182
break;

clang/lib/CodeGen/CGCall.cpp

+25-4
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,16 @@ static void setCUDAKernelCallingConvention(CanQualType &FTy, CodeGenModule &CGM,
300300
}
301301
}
302302

303+
static void setOCLKernelStubCallingConvention(CanQualType &FTy,
304+
CodeGenModule &CGM,
305+
const FunctionDecl *FD) {
306+
if (FD->hasAttr<OpenCLKernelAttr>()) {
307+
const FunctionType *FT = FTy->getAs<FunctionType>();
308+
CGM.getTargetCodeGenInfo().setOCLKernelStubCallingConvention(FT);
309+
FTy = FT->getCanonicalTypeUnqualified();
310+
}
311+
}
312+
303313
/// Arrange the argument and result information for a declaration or
304314
/// definition of the given C++ non-static member function. The
305315
/// member function must be an ordinary function, i.e. not a
@@ -460,15 +470,19 @@ CodeGenTypes::arrangeCXXConstructorCall(const CallArgList &args,
460470
/// Arrange the argument and result information for the declaration or
461471
/// definition of the given function.
462472
const CGFunctionInfo &
463-
CodeGenTypes::arrangeFunctionDeclaration(const FunctionDecl *FD) {
473+
CodeGenTypes::arrangeFunctionDeclaration(const FunctionDecl *FD,
474+
CanQualType *FTy_ptr /* = nullptr*/) {
464475
if (const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(FD))
465476
if (MD->isImplicitObjectMemberFunction())
466477
return arrangeCXXMethodDeclaration(MD);
467478

468-
CanQualType FTy = FD->getType()->getCanonicalTypeUnqualified();
479+
CanQualType FTy = FTy_ptr == nullptr
480+
? FD->getType()->getCanonicalTypeUnqualified()
481+
: *FTy_ptr;
469482

470483
assert(isa<FunctionType>(FTy));
471-
setCUDAKernelCallingConvention(FTy, CGM, FD);
484+
if (!FD->getLangOpts().OpenCL)
485+
setCUDAKernelCallingConvention(FTy, CGM, FD);
472486

473487
// When declaring a function without a prototype, always use a
474488
// non-variadic type.
@@ -548,7 +562,14 @@ CodeGenTypes::arrangeGlobalDeclaration(GlobalDecl GD) {
548562
isa<CXXDestructorDecl>(GD.getDecl()))
549563
return arrangeCXXStructorDeclaration(GD);
550564

551-
return arrangeFunctionDeclaration(FD);
565+
CanQualType FTy = FD->getType()->getCanonicalTypeUnqualified();
566+
if (FD->hasAttr<OpenCLKernelAttr>() &&
567+
GD.getKernelReferenceKind() ==
568+
KernelReferenceKind::
569+
Stub) { // OCLKernelReferenceKind::ClangOCLKernelImpl){
570+
setOCLKernelStubCallingConvention(FTy, CGM, FD);
571+
}
572+
return arrangeFunctionDeclaration(FD, &FTy);
552573
}
553574

554575
/// Arrange a thunk that takes 'this' as the first parameter followed by

clang/lib/CodeGen/CGExpr.cpp

+43
Original file line numberDiff line numberDiff line change
@@ -5607,6 +5607,43 @@ RValue CodeGenFunction::EmitCallExpr(const CallExpr *E,
56075607
return EmitCXXPseudoDestructorExpr(callee.getPseudoDestructorExpr());
56085608
}
56095609

5610+
// Change calling convention of callee function at callsite
5611+
5612+
const Expr *calleeExpr = E->getCallee()->IgnoreParens();
5613+
5614+
while (auto ICE = dyn_cast<ImplicitCastExpr>(calleeExpr)) {
5615+
if (ICE->getCastKind() == CK_FunctionToPointerDecay ||
5616+
ICE->getCastKind() == CK_BuiltinFnToFnPtr)
5617+
calleeExpr = ICE->getSubExpr()->IgnoreParens();
5618+
else
5619+
break;
5620+
}
5621+
5622+
if (auto DRE = dyn_cast<DeclRefExpr>(calleeExpr)) {
5623+
if (auto FD = dyn_cast<FunctionDecl>(DRE->getDecl())) {
5624+
if (FD->hasAttr<OpenCLKernelAttr>() && !FD->getBuiltinID()) {
5625+
const FunctionType *ft =
5626+
cast<FunctionType>(cast<PointerType>(getContext().getCanonicalType(
5627+
E->getCallee()->getType()))
5628+
->getPointeeType());
5629+
FunctionType *ftnc = const_cast<FunctionType *>(ft);
5630+
ftnc->setCC(CC_C);
5631+
}
5632+
}
5633+
}
5634+
if (auto ME = dyn_cast<MemberExpr>(calleeExpr)) {
5635+
if (auto FD = dyn_cast<FunctionDecl>(ME->getMemberDecl())) {
5636+
if (FD->hasAttr<OpenCLKernelAttr>() && !FD->getBuiltinID()) {
5637+
const FunctionType *ft =
5638+
cast<FunctionType>(cast<PointerType>(getContext().getCanonicalType(
5639+
E->getCallee()->getType()))
5640+
->getPointeeType());
5641+
FunctionType *ftnc = const_cast<FunctionType *>(ft);
5642+
ftnc->setCC(CC_C);
5643+
}
5644+
}
5645+
}
5646+
56105647
return EmitCall(E->getCallee()->getType(), callee, E, ReturnValue,
56115648
/*Chain=*/nullptr, CallOrInvoke);
56125649
}
@@ -5695,11 +5732,17 @@ CGCallee CodeGenFunction::EmitCallee(const Expr *E) {
56955732
// Resolve direct calls.
56965733
} else if (auto DRE = dyn_cast<DeclRefExpr>(E)) {
56975734
if (auto FD = dyn_cast<FunctionDecl>(DRE->getDecl())) {
5735+
if (FD->hasAttr<OpenCLKernelAttr>())
5736+
return EmitDirectCallee(*this,
5737+
GlobalDecl(FD, KernelReferenceKind::Stub));
56985738
return EmitDirectCallee(*this, FD);
56995739
}
57005740
} else if (auto ME = dyn_cast<MemberExpr>(E)) {
57015741
if (auto FD = dyn_cast<FunctionDecl>(ME->getMemberDecl())) {
57025742
EmitIgnoredExpr(ME->getBase());
5743+
if (FD->hasAttr<OpenCLKernelAttr>())
5744+
return EmitDirectCallee(*this,
5745+
GlobalDecl(FD, KernelReferenceKind::Stub));
57035746
return EmitDirectCallee(*this, FD);
57045747
}
57055748

clang/lib/CodeGen/CGOpenCLRuntime.cpp

+4-1
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,10 @@ static const BlockExpr *getBlockExpr(const Expr *E) {
127127
void CGOpenCLRuntime::recordBlockInfo(const BlockExpr *E,
128128
llvm::Function *InvokeF,
129129
llvm::Value *Block, llvm::Type *BlockTy) {
130-
assert(!EnqueuedBlockMap.contains(E) && "Block expression emitted twice");
130+
131+
// Since OpenCL Kernels are emitted twice (kernel version and device version),
132+
// their constituent BlockExprs will also be emitted twice.
133+
// assert(!EnqueuedBlockMap.contains(E) && "Block expression emitted twice");
131134
assert(isa<llvm::Function>(InvokeF) && "Invalid invoke function");
132135
assert(Block->getType()->isPointerTy() && "Invalid block literal type");
133136
EnqueuedBlockMap[E].InvokeFunc = InvokeF;

clang/lib/CodeGen/CodeGenModule.cpp

+15-2
Original file line numberDiff line numberDiff line change
@@ -1888,6 +1888,9 @@ static std::string getMangledNameImpl(CodeGenModule &CGM, GlobalDecl GD,
18881888
} else if (FD && FD->hasAttr<CUDAGlobalAttr>() &&
18891889
GD.getKernelReferenceKind() == KernelReferenceKind::Stub) {
18901890
Out << "__device_stub__" << II->getName();
1891+
} else if (FD && FD->hasAttr<OpenCLKernelAttr>() &&
1892+
GD.getKernelReferenceKind() == KernelReferenceKind::Stub) {
1893+
Out << "__clang_ocl_kern_imp_" << II->getName();
18911894
} else {
18921895
Out << II->getName();
18931896
}
@@ -3283,8 +3286,14 @@ void CodeGenModule::EmitDeferred() {
32833286
if (LangOpts.OpenMP && OpenMPRuntime && OpenMPRuntime->emitTargetGlobal(D))
32843287
continue;
32853288

3286-
// Otherwise, emit the definition and move on to the next one.
3287-
EmitGlobalDefinition(D, GV);
3289+
// Otherwise, emit the definition and move on to the next one,
3290+
// Do not emit a definition for a device version of an OpenCL kernel that does not
3291+
// have a body
3292+
if (!(isa<FunctionDecl>(D.getDecl()) &&
3293+
(cast<FunctionDecl>(D.getDecl()))->hasAttr<OpenCLKernelAttr>() &&
3294+
D.getKernelReferenceKind() == KernelReferenceKind::Stub &&
3295+
!((cast<FunctionDecl>(D.getDecl()))->doesThisDeclarationHaveABody())))
3296+
EmitGlobalDefinition(D, GV);
32883297

32893298
// If we found out that we need to emit more decls, do that recursively.
32903299
// This has the advantage that the decls are emitted in a DFS and related
@@ -3842,6 +3851,10 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) {
38423851

38433852
// Ignore declarations, they will be emitted on their first use.
38443853
if (const auto *FD = dyn_cast<FunctionDecl>(Global)) {
3854+
3855+
if (FD->hasAttr<OpenCLKernelAttr>())
3856+
addDeferredDeclToEmit(GlobalDecl(FD, KernelReferenceKind::Stub));
3857+
38453858
// Update deferred annotations with the latest declaration if the function
38463859
// function was already used or defined.
38473860
if (FD->hasAttr<AnnotateAttr>()) {

clang/lib/CodeGen/CodeGenTypes.h

+3-1
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,9 @@ class CodeGenTypes {
207207

208208
/// Free functions are functions that are compatible with an ordinary
209209
/// C function pointer type.
210-
const CGFunctionInfo &arrangeFunctionDeclaration(const FunctionDecl *FD);
210+
const CGFunctionInfo &
211+
arrangeFunctionDeclaration(const FunctionDecl *FD,
212+
CanQualType *FTy_ptr = nullptr);
211213
const CGFunctionInfo &arrangeFreeFunctionCall(const CallArgList &Args,
212214
const FunctionType *Ty,
213215
bool ChainCall);

clang/lib/CodeGen/TargetInfo.h

+2
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,8 @@ class TargetCodeGenInfo {
382382
virtual bool shouldEmitDWARFBitFieldSeparators() const { return false; }
383383

384384
virtual void setCUDAKernelCallingConvention(const FunctionType *&FT) const {}
385+
virtual void
386+
setOCLKernelStubCallingConvention(const FunctionType *&FT) const {}
385387

386388
/// Return the device-side type for the CUDA device builtin surface type.
387389
virtual llvm::Type *getCUDADeviceBuiltinSurfaceDeviceType() const {

clang/lib/CodeGen/Targets/AMDGPU.cpp

+8
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,8 @@ class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
321321
bool shouldEmitStaticExternCAliases() const override;
322322
bool shouldEmitDWARFBitFieldSeparators() const override;
323323
void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
324+
void
325+
setOCLKernelStubCallingConvention(const FunctionType *&FT) const override;
324326
};
325327
}
326328

@@ -598,6 +600,12 @@ void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
598600
FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
599601
}
600602

603+
void AMDGPUTargetCodeGenInfo::setOCLKernelStubCallingConvention(
604+
const FunctionType *&FT) const {
605+
FT = getABIInfo().getContext().adjustFunctionType(
606+
FT, FT->getExtInfo().withCallingConv(CC_C));
607+
}
608+
601609
/// Create an OpenCL kernel for an enqueued block.
602610
///
603611
/// The type of the first argument (the block literal) is the struct type

0 commit comments

Comments
 (0)