-
Notifications
You must be signed in to change notification settings - Fork 13.4k
[Clang][OpenCL][AMDGPU] OpenCL Kernel stubs should be assigned alwaysinline attribute #137769
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
41ea0f3
to
ad9e9dc
Compare
@llvm/pr-subscribers-clang @llvm/pr-subscribers-backend-amdgpu Author: Aniket Lal (lalaniket8) ChangesOpenCL Kernels body is emitted as stubs and the kernel is emitted as call to respective stub. (#115821). Patch is 239.87 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/137769.diff 11 Files Affected:
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index e917f3c42da06..384c4f3627212 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -6174,6 +6174,14 @@ void CodeGenModule::EmitGlobalFunctionDefinition(GlobalDecl GD,
CodeGenFunction(*this).GenerateCode(GD, Fn, FI);
setNonAliasAttributes(GD, Fn);
+
+ if (D->hasAttr<OpenCLKernelAttr>())
+ if (GD.getKernelReferenceKind() == KernelReferenceKind::Stub &&
+ !Fn->hasFnAttribute(llvm::Attribute::NoInline) &&
+ !Fn->hasFnAttribute(llvm::Attribute::InlineHint) &&
+ !Fn->hasFnAttribute(llvm::Attribute::OptimizeNone))
+ Fn->addFnAttr(llvm::Attribute::AlwaysInline);
+
SetLLVMFunctionAttributesForDefinition(D, Fn);
if (const ConstructorAttr *CA = D->getAttr<ConstructorAttr>())
diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
index 49604c6c5e61b..58c358672dd0e 100644
--- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
+++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
@@ -128,18 +128,29 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-LABEL: define spir_kernel void @ker(
// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] {
// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4
+// X86-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4
// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4
// X86-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4
// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4
-// X86-NEXT: call void @__clang_ocl_kern_imp_ker(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR3:[0-9]+]]
+// X86-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_I]], align 4
+// X86-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_I]], align 4
+// X86-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_I]], align 4
+// X86-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_I]], align 4
+// X86-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[TMP3]], i32 1
+// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i32 36, i1 false)
+// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP_I]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4:[0-9]+]]
+// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[TMP2]], ptr align 4 [[TMP_I]], i32 64, i1 false)
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_ker(
-// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] {
+// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
@@ -152,7 +163,7 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
// X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[TMP1]], i32 1
// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 36, i1 false)
-// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]]
// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 64, i1 false)
// X86-NEXT: ret void
//
@@ -168,18 +179,29 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-LABEL: define spir_kernel void @ker_large(
// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4
+// X86-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4
// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4
// X86-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4
// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4
-// X86-NEXT: call void @__clang_ocl_kern_imp_ker_large(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR3]]
+// X86-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_I]], align 4
+// X86-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_I]], align 4
+// X86-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_I]], align 4
+// X86-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_I]], align 4
+// X86-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP3]], i32 1
+// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i32 4096, i1 false)
+// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]]
+// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[TMP2]], ptr align 4 [[TMP_I]], i32 16384, i1 false)
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_ker_large(
-// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
+// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
@@ -192,7 +214,7 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
// X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i32 1
// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 4096, i1 false)
-// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]]
// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 16384, i1 false)
// X86-NEXT: ret void
//
@@ -227,16 +249,18 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-LABEL: define spir_kernel void @test_indirect_arg_local(
// X86-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
// X86-NEXT: [[ENTRY:.*:]]
-// X86-NEXT: call void @__clang_ocl_kern_imp_test_indirect_arg_local() #[[ATTR3]]
+// X86-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 4
+// X86-NEXT: call void @llvm.memcpy.p0.p3.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i32 800, i1 false)
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_test_indirect_arg_local(
-// X86-SAME: ) #[[ATTR0]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
+// X86-SAME: ) #[[ATTR2]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 4
// X86-NEXT: call void @llvm.memcpy.p0.p3.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i32 800, i1 false)
-// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
@@ -244,61 +268,74 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-SAME: ) #[[ATTR0]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8
-// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[P_S]]) #[[ATTR3]]
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[P_S]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define spir_kernel void @KernelOneMember(
// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
// X86-NEXT: [[ENTRY:.*:]]
-// X86-NEXT: call void @__clang_ocl_kern_imp_KernelOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8
+// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[U1]], ptr align 4 [[U]], i64 8, i1 false)
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 8, i1 false)
+// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U_I]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_KernelOneMember(
-// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] {
+// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8
// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 8, i1 false)
-// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define spir_kernel void @KernelOneMemberSpir(
// X86-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[U_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 4
// X86-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 4
// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4
-// X86-NEXT: call void @__clang_ocl_kern_imp_KernelOneMemberSpir(ptr addrspace(1) noundef align 8 [[TMP0]]) #[[ATTR3]]
+// X86-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[U_ADDR_I]], align 4
+// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_I]], align 4
+// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 8 [[TMP1]], i32 8, i1 false)
+// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_KernelOneMemberSpir(
-// X86-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META14]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
+// X86-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META14]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 4
// X86-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 4
// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4
// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 8 [[TMP0]], i32 8, i1 false)
-// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define spir_kernel void @KernelLargeOneMember(
// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
-// X86-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8
+// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[U1]], ptr align 4 [[U]], i64 800, i1 false)
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 800, i1 false)
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U_I]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_KernelLargeOneMember(
-// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8
// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 800, i1 false)
-// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
@@ -332,32 +369,40 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-LABEL: define spir_kernel void @KernelTwoMember(
// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
-// X86-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8
+// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[U1]], ptr align 4 [[U]], i64 16, i1 false)
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 16, i1 false)
+// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U_I]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_KernelTwoMember(
-// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
+// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8
// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 16, i1 false)
-// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL...
[truncated]
|
@llvm/pr-subscribers-clang-codegen Author: Aniket Lal (lalaniket8) ChangesOpenCL Kernels body is emitted as stubs and the kernel is emitted as call to respective stub. (#115821). Patch is 239.87 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/137769.diff 11 Files Affected:
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index e917f3c42da06..384c4f3627212 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -6174,6 +6174,14 @@ void CodeGenModule::EmitGlobalFunctionDefinition(GlobalDecl GD,
CodeGenFunction(*this).GenerateCode(GD, Fn, FI);
setNonAliasAttributes(GD, Fn);
+
+ if (D->hasAttr<OpenCLKernelAttr>())
+ if (GD.getKernelReferenceKind() == KernelReferenceKind::Stub &&
+ !Fn->hasFnAttribute(llvm::Attribute::NoInline) &&
+ !Fn->hasFnAttribute(llvm::Attribute::InlineHint) &&
+ !Fn->hasFnAttribute(llvm::Attribute::OptimizeNone))
+ Fn->addFnAttr(llvm::Attribute::AlwaysInline);
+
SetLLVMFunctionAttributesForDefinition(D, Fn);
if (const ConstructorAttr *CA = D->getAttr<ConstructorAttr>())
diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
index 49604c6c5e61b..58c358672dd0e 100644
--- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
+++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
@@ -128,18 +128,29 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-LABEL: define spir_kernel void @ker(
// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] {
// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4
+// X86-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4
// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4
// X86-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4
// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4
-// X86-NEXT: call void @__clang_ocl_kern_imp_ker(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR3:[0-9]+]]
+// X86-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_I]], align 4
+// X86-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_I]], align 4
+// X86-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_I]], align 4
+// X86-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_I]], align 4
+// X86-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[TMP3]], i32 1
+// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i32 36, i1 false)
+// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP_I]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4:[0-9]+]]
+// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[TMP2]], ptr align 4 [[TMP_I]], i32 64, i1 false)
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_ker(
-// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] {
+// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
@@ -152,7 +163,7 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
// X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[TMP1]], i32 1
// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 36, i1 false)
-// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]]
// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 64, i1 false)
// X86-NEXT: ret void
//
@@ -168,18 +179,29 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-LABEL: define spir_kernel void @ker_large(
// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4
+// X86-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4
// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4
// X86-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4
// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4
-// X86-NEXT: call void @__clang_ocl_kern_imp_ker_large(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR3]]
+// X86-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_I]], align 4
+// X86-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_I]], align 4
+// X86-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_I]], align 4
+// X86-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_I]], align 4
+// X86-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP3]], i32 1
+// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i32 4096, i1 false)
+// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]]
+// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[TMP2]], ptr align 4 [[TMP_I]], i32 16384, i1 false)
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_ker_large(
-// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
+// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
@@ -192,7 +214,7 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
// X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i32 1
// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 4096, i1 false)
-// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]]
// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 16384, i1 false)
// X86-NEXT: ret void
//
@@ -227,16 +249,18 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-LABEL: define spir_kernel void @test_indirect_arg_local(
// X86-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
// X86-NEXT: [[ENTRY:.*:]]
-// X86-NEXT: call void @__clang_ocl_kern_imp_test_indirect_arg_local() #[[ATTR3]]
+// X86-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 4
+// X86-NEXT: call void @llvm.memcpy.p0.p3.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i32 800, i1 false)
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_test_indirect_arg_local(
-// X86-SAME: ) #[[ATTR0]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
+// X86-SAME: ) #[[ATTR2]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 4
// X86-NEXT: call void @llvm.memcpy.p0.p3.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i32 800, i1 false)
-// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
@@ -244,61 +268,74 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-SAME: ) #[[ATTR0]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8
-// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[P_S]]) #[[ATTR3]]
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[P_S]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define spir_kernel void @KernelOneMember(
// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
// X86-NEXT: [[ENTRY:.*:]]
-// X86-NEXT: call void @__clang_ocl_kern_imp_KernelOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8
+// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[U1]], ptr align 4 [[U]], i64 8, i1 false)
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 8, i1 false)
+// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U_I]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_KernelOneMember(
-// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] {
+// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8
// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 8, i1 false)
-// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define spir_kernel void @KernelOneMemberSpir(
// X86-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[U_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 4
// X86-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 4
// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4
-// X86-NEXT: call void @__clang_ocl_kern_imp_KernelOneMemberSpir(ptr addrspace(1) noundef align 8 [[TMP0]]) #[[ATTR3]]
+// X86-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[U_ADDR_I]], align 4
+// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_I]], align 4
+// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 8 [[TMP1]], i32 8, i1 false)
+// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_KernelOneMemberSpir(
-// X86-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META14]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
+// X86-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META14]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4
// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 4
// X86-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 4
// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4
// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 8 [[TMP0]], i32 8, i1 false)
-// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define spir_kernel void @KernelLargeOneMember(
// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
-// X86-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8
+// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[U1]], ptr align 4 [[U]], i64 800, i1 false)
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 800, i1 false)
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U_I]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_KernelLargeOneMember(
-// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8
// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 800, i1 false)
-// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
@@ -332,32 +369,40 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// X86-LABEL: define spir_kernel void @KernelTwoMember(
// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
-// X86-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8
+// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[U1]], ptr align 4 [[U]], i64 16, i1 false)
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 16, i1 false)
+// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U_I]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL: define void @__clang_ocl_kern_imp_KernelTwoMember(
-// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
+// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
// X86-NEXT: [[ENTRY:.*:]]
// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8
// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 16, i1 false)
-// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR4]]
// X86-NEXT: ret void
//
//
// X86-LABEL...
[truncated]
|
ad9e9dc
to
faed4e3
Compare
In practice this should be a single use of an internal function and should not require this hint. Is this papering over a different issue? |
Why do you think it is an internal function? I thought it's an externally callable stub. |
faed4e3
to
8fd3b0c
Compare
Agree. OpenCL allows calling an external kernel function https://godbolt.org/z/ThPKfxKqo so the stub function should have external linkage. |
Yes at construction, but in the real optimization pipeline it will be internalized |
Apparently the inliner has an arbitrary size threshold even for single uses. I don't see the point of that, we should probably remove that |
(CodeGenOpts.OptimizationLevel == 0) && | ||
!D->hasAttr<MinSizeAttr>(); | ||
|
||
if (D->hasAttr<OpenCLKernelAttr>()) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Braces
bool ShouldAddOptNone = !CodeGenOpts.DisableO0ImplyOptNone && | ||
(CodeGenOpts.OptimizationLevel == 0) && | ||
!D->hasAttr<MinSizeAttr>(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Really hate optnone being used to implement -O0
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am following the logic for ShouldAddOptNone
used later in SetLLVMFunctionAttributesForDefinition(D, Fn);
here https://github.com/llvm/llvm-project/blob/main/clang/lib/CodeGen/CodeGenModule.cpp#L2552 to decide if optnone and noinline attribute should be added.
If we don't do the same check we end up adding alwaysinline to the stub in cases where ShouldAddOptNone
would be true. This prevents adding of optnone and noinline later in https://github.com/llvm/llvm-project/blob/main/clang/lib/CodeGen/CodeGenModule.cpp#L2552
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, I know why you did this. I'm objecting to the fact that you need to
if (GD.getKernelReferenceKind() == KernelReferenceKind::Stub && | ||
!D->hasAttr<NoInlineAttr>() && | ||
!Fn->hasFnAttribute(llvm::Attribute::NoInline) && | ||
!D->hasAttr<OptimizeNoneAttr>() && | ||
!Fn->hasFnAttribute(llvm::Attribute::OptimizeNone) && !ShouldAddOptNone) | ||
Fn->addFnAttr(llvm::Attribute::AlwaysInline); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are fewer test changes than I expect for this. Do we have tests with explicit optnone or noinline placed on kernels?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So far only one test https://github.com/llvm/llvm-project/blob/main/clang/test/CodeGenOpenCL/opencl-kernel-call.cl#L69
adds noinline to kernel function. This test was added as part of the orignal PR that introduced OpenCL kernel to kernel call feature.
OpenCL Kernels body is emitted as stubs and the kernel is emitted as call to respective stub. (#115821).
The stub function should be alwaysinlined, since call to stub can cause performance drop.