Skip to content

[Clang][OpenCL][AMDGPU] Allow a kernel to call another kernel #115821

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions clang/include/clang/AST/Decl.h
Original file line number Diff line number Diff line change
Expand Up @@ -3048,6 +3048,8 @@ class FunctionDecl : public DeclaratorDecl,
/// Converts a DeclContext pointer back into a FunctionDecl pointer, dropping
/// the const qualifier on the way.
static FunctionDecl *castFromDeclContext(const DeclContext *DC) {
  auto *MutableDC = const_cast<DeclContext *>(DC);
  return static_cast<FunctionDecl *>(MutableDC);
}

/// Returns true if this function carries a CUDAGlobalAttr or an
/// OpenCLKernelAttr, i.e. it is a GPU kernel that a GlobalDecl may refer to
/// with a KernelReferenceKind.
bool isReferenceableKernel() const;
};

/// Represents a member of a struct/union/class.
Expand Down
18 changes: 10 additions & 8 deletions clang/include/clang/AST/GlobalDecl.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,15 @@ class GlobalDecl {
GlobalDecl(const VarDecl *D) { Init(D);}
GlobalDecl(const FunctionDecl *D, unsigned MVIndex = 0)
: MultiVersionIndex(MVIndex) {
if (!D->hasAttr<CUDAGlobalAttr>()) {
Init(D);
if (D->isReferenceableKernel()) {
Value.setPointerAndInt(D, unsigned(getDefaultKernelReference(D)));
return;
}
Value.setPointerAndInt(D, unsigned(getDefaultKernelReference(D)));
Init(D);
}
GlobalDecl(const FunctionDecl *D, KernelReferenceKind Kind)
: Value(D, unsigned(Kind)) {
assert(D->hasAttr<CUDAGlobalAttr>() && "Decl is not a GPU kernel!");
assert(D->isReferenceableKernel() && "Decl is not a GPU kernel!");
}
GlobalDecl(const NamedDecl *D) { Init(D); }
GlobalDecl(const BlockDecl *D) { Init(D); }
Expand Down Expand Up @@ -131,12 +131,13 @@ class GlobalDecl {

KernelReferenceKind getKernelReferenceKind() const {
assert(((isa<FunctionDecl>(getDecl()) &&
cast<FunctionDecl>(getDecl())->hasAttr<CUDAGlobalAttr>()) ||
cast<FunctionDecl>(getDecl())->isReferenceableKernel()) ||
(isa<FunctionTemplateDecl>(getDecl()) &&
cast<FunctionTemplateDecl>(getDecl())
->getTemplatedDecl()
->hasAttr<CUDAGlobalAttr>())) &&
"Decl is not a GPU kernel!");

return static_cast<KernelReferenceKind>(Value.getInt());
}

Expand All @@ -160,8 +161,9 @@ class GlobalDecl {
}

static KernelReferenceKind getDefaultKernelReference(const FunctionDecl *D) {
return D->getLangOpts().CUDAIsDevice ? KernelReferenceKind::Kernel
: KernelReferenceKind::Stub;
return (D->hasAttr<OpenCLKernelAttr>() || D->getLangOpts().CUDAIsDevice)
? KernelReferenceKind::Kernel
: KernelReferenceKind::Stub;
}

GlobalDecl getWithDecl(const Decl *D) {
Expand Down Expand Up @@ -197,7 +199,7 @@ class GlobalDecl {

GlobalDecl getWithKernelReferenceKind(KernelReferenceKind Kind) {
assert(isa<FunctionDecl>(getDecl()) &&
cast<FunctionDecl>(getDecl())->hasAttr<CUDAGlobalAttr>() &&
cast<FunctionDecl>(getDecl())->isReferenceableKernel() &&
"Decl is not a GPU kernel!");
GlobalDecl Result(*this);
Result.Value.setInt(unsigned(Kind));
Expand Down
4 changes: 4 additions & 0 deletions clang/lib/AST/Decl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5468,6 +5468,10 @@ FunctionDecl *FunctionDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
/*TrailingRequiresClause=*/{});
}

/// Reports whether this function is a GPU kernel, i.e. whether it is marked
/// with either the CUDA global attribute or the OpenCL kernel attribute.
bool FunctionDecl::isReferenceableKernel() const {
  if (hasAttr<CUDAGlobalAttr>())
    return true;
  return hasAttr<OpenCLKernelAttr>();
}

/// Allocates a new BlockDecl through the (ASTContext, DeclContext) placement
/// form of operator new and constructs it with the given context and location.
BlockDecl *BlockDecl::Create(ASTContext &C, DeclContext *DC, SourceLocation L) {
  BlockDecl *NewBlock = new (C, DC) BlockDecl(DC, L);
  return NewBlock;
}
Expand Down
6 changes: 3 additions & 3 deletions clang/lib/AST/Expr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -695,9 +695,9 @@ std::string PredefinedExpr::ComputeName(PredefinedIdentKind IK,
GD = GlobalDecl(CD, Ctor_Base);
else if (const CXXDestructorDecl *DD = dyn_cast<CXXDestructorDecl>(ND))
GD = GlobalDecl(DD, Dtor_Base);
else if (ND->hasAttr<CUDAGlobalAttr>())
GD = GlobalDecl(cast<FunctionDecl>(ND));
else
else if (auto FD = dyn_cast<FunctionDecl>(ND)) {
GD = FD->isReferenceableKernel() ? GlobalDecl(FD) : GlobalDecl(ND);
} else
GD = GlobalDecl(ND);
MC->mangleName(GD, Out);

Expand Down
15 changes: 15 additions & 0 deletions clang/lib/AST/ItaniumMangle.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,7 @@ class CXXNameMangler {
void mangleSourceName(const IdentifierInfo *II);
void mangleRegCallName(const IdentifierInfo *II);
void mangleDeviceStubName(const IdentifierInfo *II);
void mangleOCLDeviceStubName(const IdentifierInfo *II);
void mangleSourceNameWithAbiTags(
const NamedDecl *ND, const AbiTagList *AdditionalAbiTags = nullptr);
void mangleLocalName(GlobalDecl GD,
Expand Down Expand Up @@ -1561,8 +1562,13 @@ void CXXNameMangler::mangleUnqualifiedName(
bool IsDeviceStub =
FD && FD->hasAttr<CUDAGlobalAttr>() &&
GD.getKernelReferenceKind() == KernelReferenceKind::Stub;
bool IsOCLDeviceStub =
FD && FD->hasAttr<OpenCLKernelAttr>() &&
GD.getKernelReferenceKind() == KernelReferenceKind::Stub;
if (IsDeviceStub)
mangleDeviceStubName(II);
else if (IsOCLDeviceStub)
mangleOCLDeviceStubName(II);
else if (IsRegCall)
mangleRegCallName(II);
else
Expand Down Expand Up @@ -1780,6 +1786,15 @@ void CXXNameMangler::mangleDeviceStubName(const IdentifierInfo *II) {
<< II->getName();
}

void CXXNameMangler::mangleOCLDeviceStubName(const IdentifierInfo *II) {
// <source-name> ::= <positive length number> __clang_ocl_kern_imp_
//                   <identifier>
// <number> ::= [n] <non-negative decimal integer>
// <identifier> ::= <unqualified source code identifier>
StringRef OCLDeviceStubNamePrefix = "__clang_ocl_kern_imp_";
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd suggest using some symbols that cannot appear in the name of a regular user-defined function, to avoid conflicts.

Copy link
Contributor Author

@lalaniket8 lalaniket8 Dec 2, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Names starting with double underscore are reserved, and shouldn't conflict with user defined names.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But you still can't prevent people from doing it, right?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, it's undefined behavior if they do, in the same way that you can technically define a function in C called _Z3foov with a completely different signature from what C++ would expect. This sort of thing is generally fine as long as we make some effort to avoid unintentional conflicts.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In terms of how the symbols should behave, this might be backwards. Alternatively, we could preserve the as-is name for the callable implicit function and add some mangling to the kernel entry point. However, I bet there's a lot more code that expects the current behavior, where the kernel name is used directly as the kernel symbol, so changing that is probably best left for later.

Out << II->getLength() + OCLDeviceStubNamePrefix.size()
<< OCLDeviceStubNamePrefix << II->getName();
}

void CXXNameMangler::mangleSourceName(const IdentifierInfo *II) {
// <source-name> ::= <positive length number> <identifier>
// <number> ::= [n] <non-negative decimal integer>
Expand Down
6 changes: 3 additions & 3 deletions clang/lib/AST/Mangle.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -540,9 +540,9 @@ class ASTNameGenerator::Implementation {
GD = GlobalDecl(CtorD, Ctor_Complete);
else if (const auto *DtorD = dyn_cast<CXXDestructorDecl>(D))
GD = GlobalDecl(DtorD, Dtor_Complete);
else if (D->hasAttr<CUDAGlobalAttr>())
GD = GlobalDecl(cast<FunctionDecl>(D));
else
else if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(D)) {
GD = FD->isReferenceableKernel() ? GlobalDecl(FD) : GlobalDecl(D);
} else
GD = GlobalDecl(D);
MC->mangleName(GD, OS);
return false;
Expand Down
6 changes: 6 additions & 0 deletions clang/lib/AST/MicrosoftMangle.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1162,9 +1162,15 @@ void MicrosoftCXXNameMangler::mangleUnqualifiedName(GlobalDecl GD,
->getTemplatedDecl()
->hasAttr<CUDAGlobalAttr>())) &&
GD.getKernelReferenceKind() == KernelReferenceKind::Stub;
bool IsOCLDeviceStub =
ND && isa<FunctionDecl>(ND) && ND->hasAttr<OpenCLKernelAttr>() &&
GD.getKernelReferenceKind() == KernelReferenceKind::Stub;
if (IsDeviceStub)
mangleSourceName(
(llvm::Twine("__device_stub__") + II->getName()).str());
else if (IsOCLDeviceStub)
mangleSourceName(
(llvm::Twine("__clang_ocl_kern_imp_") + II->getName()).str());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm surprised mangleSourceName doesn't take a twine already. Probably should fix that later

else
mangleSourceName(II->getName());
break;
Expand Down
21 changes: 15 additions & 6 deletions clang/lib/CodeGen/CGCall.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -499,7 +499,8 @@ CodeGenTypes::arrangeCXXConstructorCall(const CallArgList &args,
/// Arrange the argument and result information for the declaration or
/// definition of the given function.
const CGFunctionInfo &
CodeGenTypes::arrangeFunctionDeclaration(const FunctionDecl *FD) {
CodeGenTypes::arrangeFunctionDeclaration(const GlobalDecl GD) {
const FunctionDecl *FD = cast<FunctionDecl>(GD.getDecl());
if (const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(FD))
if (MD->isImplicitObjectMemberFunction())
return arrangeCXXMethodDeclaration(MD);
Expand All @@ -509,6 +510,13 @@ CodeGenTypes::arrangeFunctionDeclaration(const FunctionDecl *FD) {
assert(isa<FunctionType>(FTy));
setCUDAKernelCallingConvention(FTy, CGM, FD);

if (FD->hasAttr<OpenCLKernelAttr>() &&
GD.getKernelReferenceKind() == KernelReferenceKind::Stub) {
const FunctionType *FT = FTy->getAs<FunctionType>();
CGM.getTargetCodeGenInfo().setOCLKernelStubCallingConvention(FT);
FTy = FT->getCanonicalTypeUnqualified();
}

// When declaring a function without a prototype, always use a
// non-variadic type.
if (CanQual<FunctionNoProtoType> noProto = FTy.getAs<FunctionNoProtoType>()) {
Expand Down Expand Up @@ -581,13 +589,11 @@ CodeGenTypes::arrangeUnprototypedObjCMessageSend(QualType returnType,
const CGFunctionInfo &
CodeGenTypes::arrangeGlobalDeclaration(GlobalDecl GD) {
// FIXME: Do we need to handle ObjCMethodDecl?
const FunctionDecl *FD = cast<FunctionDecl>(GD.getDecl());

if (isa<CXXConstructorDecl>(GD.getDecl()) ||
isa<CXXDestructorDecl>(GD.getDecl()))
return arrangeCXXStructorDeclaration(GD);

return arrangeFunctionDeclaration(FD);
return arrangeFunctionDeclaration(GD);
}

/// Arrange a thunk that takes 'this' as the first parameter followed by
Expand Down Expand Up @@ -2391,7 +2397,6 @@ void CodeGenModule::ConstructAttributeList(StringRef Name,
// Collect function IR attributes from the callee prototype if we have one.
AddAttributesFromFunctionProtoType(getContext(), FuncAttrs,
CalleeInfo.getCalleeFunctionProtoType());

const Decl *TargetDecl = CalleeInfo.getCalleeDecl().getDecl();

// Attach assumption attributes to the declaration. If this is a call
Expand Down Expand Up @@ -2498,7 +2503,11 @@ void CodeGenModule::ConstructAttributeList(StringRef Name,
NumElemsParam);
}

if (TargetDecl->hasAttr<OpenCLKernelAttr>()) {
if (TargetDecl->hasAttr<OpenCLKernelAttr>() &&
CallingConv != CallingConv::CC_C &&
CallingConv != CallingConv::CC_SpirFunction) {
// Check CallingConv to avoid adding uniform-work-group-size attribute to
// OpenCL Kernel Stub
if (getLangOpts().OpenCLVersion <= 120) {
// OpenCL v1.2 Work groups are always uniform
FuncAttrs.addAttribute("uniform-work-group-size", "true");
Expand Down
12 changes: 11 additions & 1 deletion clang/lib/CodeGen/CGExpr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5752,6 +5752,12 @@ static CGCallee EmitDirectCallee(CodeGenFunction &CGF, GlobalDecl GD) {
return CGCallee::forDirect(CalleePtr, GD);
}

/// Builds the GlobalDecl used when \p FD is the target of a direct call.
/// A function carrying the OpenCL kernel attribute is referenced through its
/// KernelReferenceKind::Stub variant; any other function gets the default
/// GlobalDecl for its declaration.
static GlobalDecl getGlobalDeclForDirectCall(const FunctionDecl *FD) {
  const bool IsOpenCLKernel = FD->hasAttr<OpenCLKernelAttr>();
  return IsOpenCLKernel ? GlobalDecl(FD, KernelReferenceKind::Stub)
                        : GlobalDecl(FD);
}

CGCallee CodeGenFunction::EmitCallee(const Expr *E) {
E = E->IgnoreParens();

Expand All @@ -5765,7 +5771,7 @@ CGCallee CodeGenFunction::EmitCallee(const Expr *E) {
// Resolve direct calls.
} else if (auto DRE = dyn_cast<DeclRefExpr>(E)) {
if (auto FD = dyn_cast<FunctionDecl>(DRE->getDecl())) {
return EmitDirectCallee(*this, FD);
return EmitDirectCallee(*this, getGlobalDeclForDirectCall(FD));
}
} else if (auto ME = dyn_cast<MemberExpr>(E)) {
if (auto FD = dyn_cast<FunctionDecl>(ME->getMemberDecl())) {
Expand Down Expand Up @@ -6134,6 +6140,10 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType,

const auto *FnType = cast<FunctionType>(PointeeType);

if (const auto *FD = dyn_cast_or_null<FunctionDecl>(TargetDecl);
FD && FD->hasAttr<OpenCLKernelAttr>())
CGM.getTargetCodeGenInfo().setOCLKernelStubCallingConvention(FnType);

// If we are checking indirect calls and this call is indirect, check that the
// function pointer is a member of the bit set for the function type.
if (SanOpts.has(SanitizerKind::CFIICall) &&
Expand Down
20 changes: 20 additions & 0 deletions clang/lib/CodeGen/CodeGenFunction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1595,6 +1595,26 @@ void CodeGenFunction::GenerateCode(GlobalDecl GD, llvm::Function *Fn,
// Implicit copy-assignment gets the same special treatment as implicit
// copy-constructors.
emitImplicitAssignmentOperatorBody(Args);
} else if (FD->hasAttr<OpenCLKernelAttr>() &&
GD.getKernelReferenceKind() == KernelReferenceKind::Kernel) {
CallArgList CallArgs;
for (unsigned i = 0; i < Args.size(); ++i) {
Address ArgAddr = GetAddrOfLocalVar(Args[i]);
QualType ArgQualType = Args[i]->getType();
RValue ArgRValue = convertTempToRValue(ArgAddr, ArgQualType, Loc);
CallArgs.add(ArgRValue, ArgQualType);
}
GlobalDecl GDStub = GlobalDecl(FD, KernelReferenceKind::Stub);
const FunctionType *FT = cast<FunctionType>(FD->getType());
CGM.getTargetCodeGenInfo().setOCLKernelStubCallingConvention(FT);
const CGFunctionInfo &FnInfo = CGM.getTypes().arrangeFreeFunctionCall(
CallArgs, FT, /*ChainCall=*/false);
llvm::FunctionType *FTy = CGM.getTypes().GetFunctionType(FnInfo);
llvm::Constant *GDStubFunctionPointer =
CGM.getRawFunctionPointer(GDStub, FTy);
CGCallee GDStubCallee = CGCallee::forDirect(GDStubFunctionPointer, GDStub);
EmitCall(FnInfo, GDStubCallee, ReturnValueSlot(), CallArgs, nullptr, false,
Loc);
} else if (Body) {
EmitFunctionBody(Body);
} else
Expand Down
11 changes: 11 additions & 0 deletions clang/lib/CodeGen/CodeGenModule.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1903,6 +1903,9 @@ static std::string getMangledNameImpl(CodeGenModule &CGM, GlobalDecl GD,
} else if (FD && FD->hasAttr<CUDAGlobalAttr>() &&
GD.getKernelReferenceKind() == KernelReferenceKind::Stub) {
Out << "__device_stub__" << II->getName();
} else if (FD && FD->hasAttr<OpenCLKernelAttr>() &&
GD.getKernelReferenceKind() == KernelReferenceKind::Stub) {
Out << "__clang_ocl_kern_imp_" << II->getName();
} else {
Out << II->getName();
}
Expand Down Expand Up @@ -3890,6 +3893,9 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) {

// Ignore declarations, they will be emitted on their first use.
if (const auto *FD = dyn_cast<FunctionDecl>(Global)) {
if (FD->hasAttr<OpenCLKernelAttr>() && FD->doesThisDeclarationHaveABody())
addDeferredDeclToEmit(GlobalDecl(FD, KernelReferenceKind::Stub));

// Update deferred annotations with the latest declaration if the function
// was already used or defined.
if (FD->hasAttr<AnnotateAttr>()) {
Expand Down Expand Up @@ -4857,6 +4863,11 @@ CodeGenModule::GetAddrOfFunction(GlobalDecl GD, llvm::Type *Ty, bool ForVTable,
if (!Ty) {
const auto *FD = cast<FunctionDecl>(GD.getDecl());
Ty = getTypes().ConvertType(FD->getType());
if (FD->hasAttr<OpenCLKernelAttr>() &&
GD.getKernelReferenceKind() == KernelReferenceKind::Stub) {
const CGFunctionInfo &FI = getTypes().arrangeGlobalDeclaration(GD);
Ty = getTypes().GetFunctionType(FI);
}
}

// Devirtualized destructor calls may come through here instead of via
Expand Down
2 changes: 1 addition & 1 deletion clang/lib/CodeGen/CodeGenTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ class CodeGenTypes {

/// Free functions are functions that are compatible with an ordinary
/// C function pointer type.
const CGFunctionInfo &arrangeFunctionDeclaration(const FunctionDecl *FD);
const CGFunctionInfo &arrangeFunctionDeclaration(const GlobalDecl GD);
const CGFunctionInfo &arrangeFreeFunctionCall(const CallArgList &Args,
const FunctionType *Ty,
bool ChainCall);
Expand Down
6 changes: 6 additions & 0 deletions clang/lib/CodeGen/TargetInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,12 @@ unsigned TargetCodeGenInfo::getOpenCLKernelCallingConv() const {
return llvm::CallingConv::SPIR_KERNEL;
}

void TargetCodeGenInfo::setOCLKernelStubCallingConvention(
const FunctionType *&FT) const {
FT = getABIInfo().getContext().adjustFunctionType(
FT, FT->getExtInfo().withCallingConv(CC_C));
}

llvm::Constant *TargetCodeGenInfo::getNullPointer(const CodeGen::CodeGenModule &CGM,
llvm::PointerType *T, QualType QT) const {
return llvm::ConstantPointerNull::get(T);
Expand Down
2 changes: 1 addition & 1 deletion clang/lib/CodeGen/TargetInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,7 @@ class TargetCodeGenInfo {
virtual bool shouldEmitDWARFBitFieldSeparators() const { return false; }

virtual void setCUDAKernelCallingConvention(const FunctionType *&FT) const {}

/// Rewrites \p FT in place so that it carries the calling convention chosen
/// for an OpenCL kernel stub. The base implementation selects CC_C; targets
/// may override it to pick a different convention.
virtual void setOCLKernelStubCallingConvention(const FunctionType *&FT) const;
/// Return the device-side type for the CUDA device builtin surface type.
virtual llvm::Type *getCUDADeviceBuiltinSurfaceDeviceType() const {
// By default, no change from the original one.
Expand Down
8 changes: 8 additions & 0 deletions clang/lib/CodeGen/Targets/SPIR.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ class CommonSPIRTargetCodeGenInfo : public TargetCodeGenInfo {
llvm::Type *getSPIRVImageTypeFromHLSLResource(
const HLSLAttributedResourceType::Attributes &attributes,
llvm::Type *ElementType, llvm::LLVMContext &Ctx) const;
void
setOCLKernelStubCallingConvention(const FunctionType *&FT) const override;
};
class SPIRVTargetCodeGenInfo : public CommonSPIRTargetCodeGenInfo {
public:
Expand Down Expand Up @@ -230,6 +232,12 @@ void SPIRVTargetCodeGenInfo::setCUDAKernelCallingConvention(
}
}

void CommonSPIRTargetCodeGenInfo::setOCLKernelStubCallingConvention(
const FunctionType *&FT) const {
FT = getABIInfo().getContext().adjustFunctionType(
FT, FT->getExtInfo().withCallingConv(CC_SpirFunction));
}

LangAS
SPIRVTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
const VarDecl *D) const {
Expand Down
Loading