From df47d76994cb706430364bc5e878beb1f01e62fa Mon Sep 17 00:00:00 2001 From: anikelal Date: Mon, 25 Nov 2024 14:18:36 +0530 Subject: [PATCH 1/8] [Clang][OpenCL][AMDGPU] Allow a kernel to call another kernel This feature is currently not supported in the compiler. To facilitate this we emit a stub version of each kernel function body with different name mangling scheme, and replaces the respective kernel call-sites appropriately. Fixes https://github.com/llvm/llvm-project/issues/60313 D120566 was an earlier attempt made to upstream a solution for this issue. --- clang/include/clang/AST/GlobalDecl.h | 37 +++++++++---- clang/lib/AST/Expr.cpp | 3 +- clang/lib/AST/ItaniumMangle.cpp | 15 ++++++ clang/lib/AST/Mangle.cpp | 2 +- clang/lib/AST/MicrosoftMangle.cpp | 6 +++ clang/lib/CodeGen/CGBlocks.cpp | 16 +++--- clang/lib/CodeGen/CGCall.cpp | 11 +++- clang/lib/CodeGen/CGExpr.cpp | 5 +- clang/lib/CodeGen/CGOpenCLRuntime.cpp | 10 +++- clang/lib/CodeGen/CGOpenCLRuntime.h | 4 +- clang/lib/CodeGen/CodeGenModule.cpp | 7 +++ .../test/CodeGenOpenCL/opencl-kernel-call.cl | 43 +++++++++++++++ clang/test/CodeGenOpenCL/spir-calling-conv.cl | 4 +- clang/test/CodeGenOpenCL/visibility.cl | 53 ++++++++++++++----- 14 files changed, 176 insertions(+), 40 deletions(-) create mode 100644 clang/test/CodeGenOpenCL/opencl-kernel-call.cl diff --git a/clang/include/clang/AST/GlobalDecl.h b/clang/include/clang/AST/GlobalDecl.h index 386693cabb1fb..8a9f4b4c60e5e 100644 --- a/clang/include/clang/AST/GlobalDecl.h +++ b/clang/include/clang/AST/GlobalDecl.h @@ -71,6 +71,10 @@ class GlobalDecl { GlobalDecl(const FunctionDecl *D, unsigned MVIndex = 0) : MultiVersionIndex(MVIndex) { if (!D->hasAttr()) { + if (D->hasAttr()) { + Value.setPointerAndInt(D, unsigned(KernelReferenceKind::Kernel)); + return; + } Init(D); return; } @@ -78,7 +82,8 @@ class GlobalDecl { } GlobalDecl(const FunctionDecl *D, KernelReferenceKind Kind) : Value(D, unsigned(Kind)) { - assert(D->hasAttr() && "Decl is not a GPU kernel!"); + assert((D->hasAttr() && "Decl is not a GPU kernel!") || + (D->hasAttr() && "Decl is not a OpenCL kernel!")); } GlobalDecl(const NamedDecl *D) { Init(D); } GlobalDecl(const BlockDecl *D) { Init(D); } @@ -130,13 +135,15 @@ class GlobalDecl { } KernelReferenceKind getKernelReferenceKind() const { - assert(((isa(getDecl()) && - cast(getDecl())->hasAttr()) || - (isa(getDecl()) && - cast(getDecl()) - ->getTemplatedDecl() - ->hasAttr())) && - "Decl is not a GPU kernel!"); + assert((((isa(getDecl()) && + cast(getDecl())->hasAttr()) || + (isa(getDecl()) && + cast(getDecl()) + ->getTemplatedDecl() + ->hasAttr())) && + "Decl is not a GPU kernel!") || + (isDeclOpenCLKernel() && "Decl is not a OpenCL kernel!")); + return static_cast(Value.getInt()); } @@ -196,13 +203,21 @@ class GlobalDecl { } GlobalDecl getWithKernelReferenceKind(KernelReferenceKind Kind) { - assert(isa(getDecl()) && - cast(getDecl())->hasAttr() && - "Decl is not a GPU kernel!"); + assert((isa(getDecl()) && + cast(getDecl())->hasAttr() && + "Decl is not a GPU kernel!") || + (isDeclOpenCLKernel() && "Decl is not a OpenCL kernel!")); GlobalDecl Result(*this); Result.Value.setInt(unsigned(Kind)); return Result; } + + bool isDeclOpenCLKernel() const { + auto FD = dyn_cast(getDecl()); + if (FD) + return FD->hasAttr(); + return FD; + } }; } // namespace clang diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index e8e5f2fa0cc12..7af3c683ea805 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -695,7 +695,8 @@ std::string PredefinedExpr::ComputeName(PredefinedIdentKind IK, GD = GlobalDecl(CD, Ctor_Base); else if (const CXXDestructorDecl *DD = dyn_cast(ND)) GD = GlobalDecl(DD, Dtor_Base); - else if (ND->hasAttr()) + else if (ND->hasAttr() || + ND->hasAttr()) GD = GlobalDecl(cast(ND)); else GD = GlobalDecl(ND); diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index fdd84d0bf7c5c..29c0bdddca7a0 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -526,6 +526,7 @@ class CXXNameMangler { void mangleSourceName(const IdentifierInfo *II); void mangleRegCallName(const IdentifierInfo *II); void mangleDeviceStubName(const IdentifierInfo *II); + void mangleOCLDeviceStubName(const IdentifierInfo *II); void mangleSourceNameWithAbiTags( const NamedDecl *ND, const AbiTagList *AdditionalAbiTags = nullptr); void mangleLocalName(GlobalDecl GD, @@ -1561,8 +1562,13 @@ void CXXNameMangler::mangleUnqualifiedName( bool IsDeviceStub = FD && FD->hasAttr() && GD.getKernelReferenceKind() == KernelReferenceKind::Stub; + bool IsOCLDeviceStub = + FD && FD->hasAttr() && + GD.getKernelReferenceKind() == KernelReferenceKind::Stub; if (IsDeviceStub) mangleDeviceStubName(II); + else if (IsOCLDeviceStub) + mangleOCLDeviceStubName(II); else if (IsRegCall) mangleRegCallName(II); else @@ -1780,6 +1786,15 @@ void CXXNameMangler::mangleDeviceStubName(const IdentifierInfo *II) { << II->getName(); } +void CXXNameMangler::mangleOCLDeviceStubName(const IdentifierInfo *II) { + // ::= __clang_ocl_kern_imp_ + // ::= [n] + // ::= + StringRef OCLDeviceStubNamePrefix = "__clang_ocl_kern_imp_"; + Out << II->getLength() + OCLDeviceStubNamePrefix.size() - 1 + << OCLDeviceStubNamePrefix << II->getName(); +} + void CXXNameMangler::mangleSourceName(const IdentifierInfo *II) { // ::= // ::= [n] diff --git a/clang/lib/AST/Mangle.cpp b/clang/lib/AST/Mangle.cpp index b44ab23f1d0e1..96ae823bf55e0 100644 --- a/clang/lib/AST/Mangle.cpp +++ b/clang/lib/AST/Mangle.cpp @@ -540,7 +540,7 @@ class ASTNameGenerator::Implementation { GD = GlobalDecl(CtorD, Ctor_Complete); else if (const auto *DtorD = dyn_cast(D)) GD = GlobalDecl(DtorD, Dtor_Complete); - else if (D->hasAttr()) + else if (D->hasAttr() || D->hasAttr()) GD = GlobalDecl(cast(D)); else GD = GlobalDecl(D); diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp index 7e964124a9fec..63291b23f10f4 100644 --- a/clang/lib/AST/MicrosoftMangle.cpp +++ b/clang/lib/AST/MicrosoftMangle.cpp @@ -1162,9 +1162,15 @@ void MicrosoftCXXNameMangler::mangleUnqualifiedName(GlobalDecl GD, ->getTemplatedDecl() ->hasAttr())) && GD.getKernelReferenceKind() == KernelReferenceKind::Stub; + bool IsOCLDeviceStub = + ND && (isa(ND) && ND->hasAttr()) && + GD.getKernelReferenceKind() == KernelReferenceKind::Stub; if (IsDeviceStub) mangleSourceName( (llvm::Twine("__device_stub__") + II->getName()).str()); + else if (IsOCLDeviceStub) + mangleSourceName( + (llvm::Twine("__clang_ocl_kern_imp_") + II->getName()).str()); else mangleSourceName(II->getName()); break; diff --git a/clang/lib/CodeGen/CGBlocks.cpp b/clang/lib/CodeGen/CGBlocks.cpp index faef6a5fbe1f5..b871c31a039d3 100644 --- a/clang/lib/CodeGen/CGBlocks.cpp +++ b/clang/lib/CodeGen/CGBlocks.cpp @@ -48,7 +48,7 @@ CGBlockInfo::CGBlockInfo(const BlockDecl *block, StringRef name) BlockByrefHelpers::~BlockByrefHelpers() {} /// Build the given block as a global block. -static llvm::Constant *buildGlobalBlock(CodeGenModule &CGM, +static llvm::Constant *buildGlobalBlock(CodeGenModule &CGM, GlobalDecl GD, const CGBlockInfo &blockInfo, llvm::Constant *blockFn); @@ -1085,8 +1085,10 @@ llvm::Value *CodeGenFunction::EmitBlockLiteral(const CGBlockInfo &blockInfo) { blockAddr.getPointer(), ConvertType(blockInfo.getBlockExpr()->getType())); if (IsOpenCL) { - CGM.getOpenCLRuntime().recordBlockInfo(blockInfo.BlockExpression, InvokeFn, - result, blockInfo.StructureType); + CGM.getOpenCLRuntime().recordBlockInfo( + blockInfo.BlockExpression, InvokeFn, result, blockInfo.StructureType, + CurGD && CurGD.isDeclOpenCLKernel() && + (CurGD.getKernelReferenceKind() == KernelReferenceKind::Kernel)); } return result; @@ -1264,7 +1266,7 @@ CodeGenModule::GetAddrOfGlobalBlock(const BlockExpr *BE, return getAddrOfGlobalBlockIfEmitted(BE); } -static llvm::Constant *buildGlobalBlock(CodeGenModule &CGM, +static llvm::Constant *buildGlobalBlock(CodeGenModule &CGM, GlobalDecl GD, const CGBlockInfo &blockInfo, llvm::Constant *blockFn) { assert(blockInfo.CanBeGlobal); @@ -1357,7 +1359,9 @@ static llvm::Constant *buildGlobalBlock(CodeGenModule &CGM, CGM.getOpenCLRuntime().recordBlockInfo( blockInfo.BlockExpression, cast(blockFn->stripPointerCasts()), Result, - literal->getValueType()); + literal->getValueType(), + GD && GD.isDeclOpenCLKernel() && + (GD.getKernelReferenceKind() == KernelReferenceKind::Kernel)); return Result; } @@ -1466,7 +1470,7 @@ llvm::Function *CodeGenFunction::GenerateBlockFunction( auto GenVoidPtrTy = getContext().getLangOpts().OpenCL ? CGM.getOpenCLRuntime().getGenericVoidPointerType() : VoidPtrTy; - buildGlobalBlock(CGM, blockInfo, + buildGlobalBlock(CGM, CurGD, blockInfo, llvm::ConstantExpr::getPointerCast(fn, GenVoidPtrTy)); } diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index b202255c3a15b..55e1fdb5ff389 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -2383,6 +2383,15 @@ void CodeGenModule::ConstructAttributeList(StringRef Name, // Collect function IR attributes from the CC lowering. // We'll collect the paramete and result attributes later. CallingConv = FI.getEffectiveCallingConvention(); + GlobalDecl GD = CalleeInfo.getCalleeDecl(); + const Decl *TargetDecl = CalleeInfo.getCalleeDecl().getDecl(); + if (TargetDecl) { + if (auto FD = dyn_cast(TargetDecl)) { + if (FD->hasAttr() && + GD.getKernelReferenceKind() == KernelReferenceKind::Stub) + CallingConv = llvm::CallingConv::C; + } + } if (FI.isNoReturn()) FuncAttrs.addAttribute(llvm::Attribute::NoReturn); if (FI.isCmseNSCall()) @@ -2392,8 +2401,6 @@ void CodeGenModule::ConstructAttributeList(StringRef Name, AddAttributesFromFunctionProtoType(getContext(), FuncAttrs, CalleeInfo.getCalleeFunctionProtoType()); - const Decl *TargetDecl = CalleeInfo.getCalleeDecl().getDecl(); - // Attach assumption attributes to the declaration. If this is a call // site, attach assumptions from the caller to the call as well. AddAttributesFromOMPAssumes(FuncAttrs, TargetDecl); diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 7fe2d9582178e..1a5c5f2cf2d7f 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -5765,7 +5765,10 @@ CGCallee CodeGenFunction::EmitCallee(const Expr *E) { // Resolve direct calls. } else if (auto DRE = dyn_cast(E)) { if (auto FD = dyn_cast(DRE->getDecl())) { - return EmitDirectCallee(*this, FD); + auto CalleeDecl = FD->hasAttr() + ? GlobalDecl(FD, KernelReferenceKind::Stub) + : FD; + return EmitDirectCallee(*this, CalleeDecl); } } else if (auto ME = dyn_cast(E)) { if (auto FD = dyn_cast(ME->getMemberDecl())) { diff --git a/clang/lib/CodeGen/CGOpenCLRuntime.cpp b/clang/lib/CodeGen/CGOpenCLRuntime.cpp index 9f8ff488755ec..1bf74b966ef4c 100644 --- a/clang/lib/CodeGen/CGOpenCLRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenCLRuntime.cpp @@ -126,8 +126,14 @@ static const BlockExpr *getBlockExpr(const Expr *E) { /// corresponding block expression. void CGOpenCLRuntime::recordBlockInfo(const BlockExpr *E, llvm::Function *InvokeF, - llvm::Value *Block, llvm::Type *BlockTy) { - assert(!EnqueuedBlockMap.contains(E) && "Block expression emitted twice"); + llvm::Value *Block, llvm::Type *BlockTy, + bool isBlkExprInOCLKern) { + + // FIXME: Since OpenCL Kernels are emitted twice (kernel version and stub + // version), its constituent BlockExpr will also be emitted twice. + assert((!EnqueuedBlockMap.contains(E) || + EnqueuedBlockMap[E].isBlkExprInOCLKern != isBlkExprInOCLKern) && + "Block expression emitted twice"); assert(isa(InvokeF) && "Invalid invoke function"); assert(Block->getType()->isPointerTy() && "Invalid block literal type"); EnqueuedBlockInfo &BlockInfo = EnqueuedBlockMap[E]; diff --git a/clang/lib/CodeGen/CGOpenCLRuntime.h b/clang/lib/CodeGen/CGOpenCLRuntime.h index 34613c3516f37..78bb5980cd87d 100644 --- a/clang/lib/CodeGen/CGOpenCLRuntime.h +++ b/clang/lib/CodeGen/CGOpenCLRuntime.h @@ -46,6 +46,7 @@ class CGOpenCLRuntime { llvm::Value *KernelHandle; /// Enqueued block kernel reference. llvm::Value *BlockArg; /// The first argument to enqueued block kernel. llvm::Type *BlockTy; /// Type of the block argument. + bool isBlkExprInOCLKern; /// Does the BlockExpr reside in an OpenCL Kernel. }; /// Maps block expression to block information. llvm::DenseMap EnqueuedBlockMap; @@ -93,7 +94,8 @@ class CGOpenCLRuntime { /// \param InvokeF invoke function emitted for the block expression. /// \param Block block literal emitted for the block expression. void recordBlockInfo(const BlockExpr *E, llvm::Function *InvokeF, - llvm::Value *Block, llvm::Type *BlockTy); + llvm::Value *Block, llvm::Type *BlockTy, + bool isBlkExprInOCLKern); /// \return LLVM block invoke function emitted for an expression derived from /// the block expression. diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 8f9cf965af2b9..726414aed49f4 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -1903,6 +1903,9 @@ static std::string getMangledNameImpl(CodeGenModule &CGM, GlobalDecl GD, } else if (FD && FD->hasAttr() && GD.getKernelReferenceKind() == KernelReferenceKind::Stub) { Out << "__device_stub__" << II->getName(); + } else if (FD && FD->hasAttr() && + GD.getKernelReferenceKind() == KernelReferenceKind::Stub) { + Out << "__clang_ocl_kern_imp_" << II->getName(); } else { Out << II->getName(); } @@ -3890,6 +3893,10 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) { // Ignore declarations, they will be emitted on their first use. if (const auto *FD = dyn_cast(Global)) { + + if (FD->hasAttr() && FD->doesThisDeclarationHaveABody()) + addDeferredDeclToEmit(GlobalDecl(FD, KernelReferenceKind::Stub)); + // Update deferred annotations with the latest declaration if the function // function was already used or defined. if (FD->hasAttr()) { diff --git a/clang/test/CodeGenOpenCL/opencl-kernel-call.cl b/clang/test/CodeGenOpenCL/opencl-kernel-call.cl new file mode 100644 index 0000000000000..f575728f23763 --- /dev/null +++ b/clang/test/CodeGenOpenCL/opencl-kernel-call.cl @@ -0,0 +1,43 @@ +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -emit-llvm -o - %s | FileCheck %s + +// CHECK: define dso_local amdgpu_kernel void @callee_kern({{.*}}) +__attribute__((noinline)) kernel void callee_kern(global int *A){ + *A = 1; +} + +__attribute__((noinline)) kernel void ext_callee_kern(global int *A); + +// CHECK: define dso_local void @callee_func({{.*}}) +__attribute__((noinline)) void callee_func(global int *A){ + *A = 2; +} + +// CHECK: define dso_local amdgpu_kernel void @caller_kern({{.*}}) +kernel void caller_kern(global int* A){ + callee_kern(A); + // CHECK: tail call void @__clang_ocl_kern_imp_callee_kern({{.*}}) + ext_callee_kern(A); + // CHECK: tail call void @__clang_ocl_kern_imp_ext_callee_kern({{.*}}) + callee_func(A); + // CHECK: tail call void @callee_func({{.*}}) + +} + +// CHECK: define dso_local void @__clang_ocl_kern_imp_callee_kern({{.*}}) + +// CHECK: declare void @__clang_ocl_kern_imp_ext_callee_kern({{.*}}) + +// CHECK: define dso_local void @caller_func({{.*}}) +void caller_func(global int* A){ + callee_kern(A); + // CHECK: tail call void @__clang_ocl_kern_imp_callee_kern({{.*}}) #7 + ext_callee_kern(A); + // CHECK: tail call void @__clang_ocl_kern_imp_ext_callee_kern({{.*}}) #8 + callee_func(A); + // CHECK: tail call void @callee_func({{.*}}) +} + +// CHECK: define dso_local void @__clang_ocl_kern_imp_caller_kern({{.*}}) +// CHECK: tail call void @__clang_ocl_kern_imp_callee_kern({{.*}}) +// CHECK: tail call void @__clang_ocl_kern_imp_ext_callee_kern({{.*}}) +// CHECK: tail call void @callee_func({{.*}}) diff --git a/clang/test/CodeGenOpenCL/spir-calling-conv.cl b/clang/test/CodeGenOpenCL/spir-calling-conv.cl index 569ea0cbe1af6..6c8f20511b8bd 100644 --- a/clang/test/CodeGenOpenCL/spir-calling-conv.cl +++ b/clang/test/CodeGenOpenCL/spir-calling-conv.cl @@ -11,8 +11,8 @@ kernel void foo(global int *A) // CHECK: %{{[a-z0-9_]+}} = tail call spir_func i32 @get_dummy_id(i32 noundef 0) A[id] = id; bar(A); - // CHECK: tail call spir_kernel void @bar(ptr addrspace(1) noundef align 4 %A) + // CHECK: tail call void @__clang_ocl_kern_imp_bar(ptr addrspace(1) noundef align 4 %A) } // CHECK: declare spir_func i32 @get_dummy_id(i32 noundef) -// CHECK: declare spir_kernel void @bar(ptr addrspace(1) noundef align 4) +// CHECK: declare void @__clang_ocl_kern_imp_bar(ptr addrspace(1) noundef align 4) diff --git a/clang/test/CodeGenOpenCL/visibility.cl b/clang/test/CodeGenOpenCL/visibility.cl index addfe33377f93..e5dc5b29c5140 100644 --- a/clang/test/CodeGenOpenCL/visibility.cl +++ b/clang/test/CodeGenOpenCL/visibility.cl @@ -85,31 +85,42 @@ __attribute__((visibility("default"))) extern void ext_func_default(); void use() { glob = ext + ext_hidden + ext_protected + ext_default; ext_kern(); + // FVIS-DEFAULT: tail call void @__clang_ocl_kern_imp_ext_kern() + // FVIS-PROTECTED: tail call void @__clang_ocl_kern_imp_ext_kern() + // FVIS-HIDDEN: tail call void @__clang_ocl_kern_imp_ext_kern() ext_kern_hidden(); + // FVIS-DEFAULT: tail call void @__clang_ocl_kern_imp_ext_kern_hidden() + // FVIS-PROTECTED: tail call void @__clang_ocl_kern_imp_ext_kern_hidden() + // FVIS-HIDDEN: tail call void @__clang_ocl_kern_imp_ext_kern_hidden() ext_kern_protected(); + // FVIS-DEFAULT: tail call void @__clang_ocl_kern_imp_ext_kern_protected() + // FVIS-PROTECTED: tail call void @__clang_ocl_kern_imp_ext_kern_protected() + // FVIS-HIDDEN: tail call void @__clang_ocl_kern_imp_ext_kern_protected() ext_kern_default(); + // FVIS-DEFAULT: tail call void @__clang_ocl_kern_imp_ext_kern_default() + // FVIS-PROTECTED: tail call void @__clang_ocl_kern_imp_ext_kern_default() + // FVIS-HIDDEN: tail call void @__clang_ocl_kern_imp_ext_kern_default() ext_func(); ext_func_hidden(); ext_func_protected(); ext_func_default(); } -// FVIS-DEFAULT: declare amdgpu_kernel void @ext_kern() -// FVIS-PROTECTED: declare protected amdgpu_kernel void @ext_kern() -// FVIS-HIDDEN: declare protected amdgpu_kernel void @ext_kern() +// FVIS-DEFAULT: declare void @__clang_ocl_kern_imp_ext_kern() +// FVIS-PROTECTED: declare protected void @__clang_ocl_kern_imp_ext_kern() +// FVIS-HIDDEN: declare protected void @__clang_ocl_kern_imp_ext_kern() -// FVIS-DEFAULT: declare protected amdgpu_kernel void @ext_kern_hidden() -// FVIS-PROTECTED: declare protected amdgpu_kernel void @ext_kern_hidden() -// FVIS-HIDDEN: declare protected amdgpu_kernel void @ext_kern_hidden() +// FVIS-DEFAULT: declare protected void @__clang_ocl_kern_imp_ext_kern_hidden() +// FVIS-PROTECTED: declare protected void @__clang_ocl_kern_imp_ext_kern_hidden() +// FVIS-HIDDEN: declare protected void @__clang_ocl_kern_imp_ext_kern_hidden() -// FVIS-DEFAULT: declare protected amdgpu_kernel void @ext_kern_protected() -// FVIS-PROTECTED: declare protected amdgpu_kernel void @ext_kern_protected() -// FVIS-HIDDEN: declare protected amdgpu_kernel void @ext_kern_protected() - -// FVIS-DEFAULT: declare amdgpu_kernel void @ext_kern_default() -// FVIS-PROTECTED: declare amdgpu_kernel void @ext_kern_default() -// FVIS-HIDDEN: declare amdgpu_kernel void @ext_kern_default() +// FVIS-DEFAULT: declare protected void @__clang_ocl_kern_imp_ext_kern_protected() +// FVIS-PROTECTED: declare protected void @__clang_ocl_kern_imp_ext_kern_protected() +// FVIS-HIDDEN: declare protected void @__clang_ocl_kern_imp_ext_kern_protected() +// FVIS-DEFAULT: declare void @__clang_ocl_kern_imp_ext_kern_default() +// FVIS-PROTECTED: declare void @__clang_ocl_kern_imp_ext_kern_default() +// FVIS-HIDDEN: declare void @__clang_ocl_kern_imp_ext_kern_default() // FVIS-DEFAULT: declare void @ext_func() // FVIS-PROTECTED: declare protected void @ext_func() @@ -126,3 +137,19 @@ void use() { // FVIS-DEFAULT: declare void @ext_func_default() // FVIS-PROTECTED: declare void @ext_func_default() // FVIS-HIDDEN: declare void @ext_func_default() + +// FVIS-DEFAULT: define{{.*}} void @__clang_ocl_kern_imp_kern() +// FVIS-PROTECTED: define protected void @__clang_ocl_kern_imp_kern() +// FVIS-HIDDEN: define protected void @__clang_ocl_kern_imp_kern() + +// FVIS-DEFAULT: define protected void @__clang_ocl_kern_imp_kern_hidden() +// FVIS-PROTECTED: define protected void @__clang_ocl_kern_imp_kern_hidden() +// FVIS-HIDDEN: define protected void @__clang_ocl_kern_imp_kern_hidden() + +// FVIS-DEFAULT: define protected void @__clang_ocl_kern_imp_kern_protected() +// FVIS-PROTECTED: define protected void @__clang_ocl_kern_imp_kern_protected() +// FVIS-HIDDEN: define protected void @__clang_ocl_kern_imp_kern_protected() + +// FVIS-DEFAULT: define{{.*}} void @__clang_ocl_kern_imp_kern_default() +// FVIS-PROTECTED: define{{.*}} void @__clang_ocl_kern_imp_kern_default() +// FVIS-HIDDEN: define{{.*}} void @__clang_ocl_kern_imp_kern_default() From 24e84d2fbb84190a81296739af549a966557d2f8 Mon Sep 17 00:00:00 2001 From: anikelal Date: Fri, 29 Nov 2024 14:03:57 +0530 Subject: [PATCH 2/8] [Clang][OpenCL][AMDGPU] Allow a kernel to call another kernel Simplifying isDeclOpenCLKernel() and removing resolved comments --- clang/include/clang/AST/GlobalDecl.h | 5 ++--- clang/lib/CodeGen/CGOpenCLRuntime.cpp | 2 -- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/clang/include/clang/AST/GlobalDecl.h b/clang/include/clang/AST/GlobalDecl.h index 8a9f4b4c60e5e..342232dcf5641 100644 --- a/clang/include/clang/AST/GlobalDecl.h +++ b/clang/include/clang/AST/GlobalDecl.h @@ -213,10 +213,9 @@ class GlobalDecl { } bool isDeclOpenCLKernel() const { - auto FD = dyn_cast(getDecl()); - if (FD) + if (auto FD = dyn_cast(getDecl())) return FD->hasAttr(); - return FD; + return false; } }; diff --git a/clang/lib/CodeGen/CGOpenCLRuntime.cpp b/clang/lib/CodeGen/CGOpenCLRuntime.cpp index 1bf74b966ef4c..4481c06d5eb2a 100644 --- a/clang/lib/CodeGen/CGOpenCLRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenCLRuntime.cpp @@ -129,8 +129,6 @@ void CGOpenCLRuntime::recordBlockInfo(const BlockExpr *E, llvm::Value *Block, llvm::Type *BlockTy, bool isBlkExprInOCLKern) { - // FIXME: Since OpenCL Kernels are emitted twice (kernel version and stub - // version), its constituent BlockExpr will also be emitted twice. assert((!EnqueuedBlockMap.contains(E) || EnqueuedBlockMap[E].isBlkExprInOCLKern != isBlkExprInOCLKern) && "Block expression emitted twice"); From 3f35a63a257c828c6cd1b44a9227b815e7b4a790 Mon Sep 17 00:00:00 2001 From: anikelal Date: Wed, 22 Jan 2025 15:24:11 +0530 Subject: [PATCH 3/8] Emitting OpenCL Kernels as calls to their respective inlined stubs to avoid double emission of kernel body. --- clang/include/clang/AST/Decl.h | 2 + clang/include/clang/AST/GlobalDecl.h | 46 +- clang/lib/AST/Decl.cpp | 4 + clang/lib/AST/Expr.cpp | 7 +- clang/lib/AST/ItaniumMangle.cpp | 2 +- clang/lib/AST/Mangle.cpp | 6 +- clang/lib/CodeGen/CGBlocks.cpp | 16 +- clang/lib/CodeGen/CGCall.cpp | 25 +- clang/lib/CodeGen/CGExpr.cpp | 17 +- clang/lib/CodeGen/CGOpenCLRuntime.cpp | 8 +- clang/lib/CodeGen/CGOpenCLRuntime.h | 4 +- clang/lib/CodeGen/CodeGenFunction.cpp | 20 + clang/lib/CodeGen/CodeGenModule.cpp | 16 + clang/lib/CodeGen/CodeGenTypes.h | 2 +- clang/lib/CodeGen/TargetInfo.h | 3 +- clang/lib/CodeGen/Targets/AMDGPU.cpp | 8 + clang/lib/CodeGen/Targets/SPIR.cpp | 8 + clang/lib/CodeGen/Targets/X86.cpp | 6 + .../CodeGenOpenCL/addr-space-struct-arg.cl | 570 +++++++++++------- .../amdgpu-abi-struct-arg-byref.cl | 115 ++-- .../CodeGenOpenCL/amdgpu-abi-struct-coerce.cl | 2 +- .../CodeGenOpenCL/amdgpu-enqueue-kernel.cl | 361 ++++++----- clang/test/CodeGenOpenCL/amdgpu-printf.cl | 91 ++- .../test/CodeGenOpenCL/cl-uniform-wg-size.cl | 8 +- .../cl20-device-side-enqueue-attributes.cl | 132 +++- .../CodeGenOpenCL/cl20-device-side-enqueue.cl | 2 +- clang/test/CodeGenOpenCL/convergent.cl | 9 +- .../enqueue-kernel-non-entry-block.cl | 2 +- clang/test/CodeGenOpenCL/kernel-arg-info.cl | 10 + .../test/CodeGenOpenCL/opencl-kernel-call.cl | 101 +++- clang/test/CodeGenOpenCL/reflect.cl | 9 +- clang/test/CodeGenOpenCL/spir-calling-conv.cl | 4 +- clang/test/CodeGenOpenCL/visibility.cl | 26 +- .../CodeGenOpenCLCXX/addrspace-of-this.clcpp | 7 +- 34 files changed, 1082 insertions(+), 567 deletions(-) diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index ff1d3497b77c3..798f112ce7200 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -3048,6 +3048,8 @@ class FunctionDecl : public DeclaratorDecl, static FunctionDecl *castFromDeclContext(const DeclContext *DC) { return static_cast(const_cast(DC)); } + + bool isReferenceableKernel() const; }; /// Represents a member of a struct/union/class. diff --git a/clang/include/clang/AST/GlobalDecl.h b/clang/include/clang/AST/GlobalDecl.h index 342232dcf5641..df11a79a56b3b 100644 --- a/clang/include/clang/AST/GlobalDecl.h +++ b/clang/include/clang/AST/GlobalDecl.h @@ -70,20 +70,15 @@ class GlobalDecl { GlobalDecl(const VarDecl *D) { Init(D);} GlobalDecl(const FunctionDecl *D, unsigned MVIndex = 0) : MultiVersionIndex(MVIndex) { - if (!D->hasAttr()) { - if (D->hasAttr()) { - Value.setPointerAndInt(D, unsigned(KernelReferenceKind::Kernel)); - return; - } - Init(D); + if (D->isReferenceableKernel()) { + Value.setPointerAndInt(D, unsigned(getDefaultKernelReference(D))); return; } - Value.setPointerAndInt(D, unsigned(getDefaultKernelReference(D))); + Init(D); } GlobalDecl(const FunctionDecl *D, KernelReferenceKind Kind) : Value(D, unsigned(Kind)) { - assert((D->hasAttr() && "Decl is not a GPU kernel!") || - (D->hasAttr() && "Decl is not a OpenCL kernel!")); + assert(D->isReferenceableKernel() && "Decl is not a GPU kernel!"); } GlobalDecl(const NamedDecl *D) { Init(D); } GlobalDecl(const BlockDecl *D) { Init(D); } @@ -135,14 +130,13 @@ class GlobalDecl { } KernelReferenceKind getKernelReferenceKind() const { - assert((((isa(getDecl()) && - cast(getDecl())->hasAttr()) || - (isa(getDecl()) && - cast(getDecl()) - ->getTemplatedDecl() - ->hasAttr())) && - "Decl is not a GPU kernel!") || - (isDeclOpenCLKernel() && "Decl is not a OpenCL kernel!")); + assert(((isa(getDecl()) && + cast(getDecl())->isReferenceableKernel()) || + (isa(getDecl()) && + cast(getDecl()) + ->getTemplatedDecl() + ->hasAttr())) && + "Decl is not a GPU kernel!"); return static_cast(Value.getInt()); } @@ -167,8 +161,9 @@ class GlobalDecl { } static KernelReferenceKind getDefaultKernelReference(const FunctionDecl *D) { - return D->getLangOpts().CUDAIsDevice ? KernelReferenceKind::Kernel - : KernelReferenceKind::Stub; + return (D->hasAttr() || D->getLangOpts().CUDAIsDevice) + ? KernelReferenceKind::Kernel + : KernelReferenceKind::Stub; } GlobalDecl getWithDecl(const Decl *D) { @@ -203,20 +198,13 @@ class GlobalDecl { } GlobalDecl getWithKernelReferenceKind(KernelReferenceKind Kind) { - assert((isa(getDecl()) && - cast(getDecl())->hasAttr() && - "Decl is not a GPU kernel!") || - (isDeclOpenCLKernel() && "Decl is not a OpenCL kernel!")); + assert(isa(getDecl()) && + cast(getDecl())->isReferenceableKernel() && + "Decl is not a GPU kernel!"); GlobalDecl Result(*this); Result.Value.setInt(unsigned(Kind)); return Result; } - - bool isDeclOpenCLKernel() const { - if (auto FD = dyn_cast(getDecl())) - return FD->hasAttr(); - return false; - } }; } // namespace clang diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 0e4d69392e8c7..83116ecc0f47b 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -5468,6 +5468,10 @@ FunctionDecl *FunctionDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) { /*TrailingRequiresClause=*/{}); } +bool FunctionDecl::isReferenceableKernel() const { + return hasAttr() || hasAttr(); +} + BlockDecl *BlockDecl::Create(ASTContext &C, DeclContext *DC, SourceLocation L) { return new (C, DC) BlockDecl(DC, L); } diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 7af3c683ea805..4deed08d693ac 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -695,10 +695,9 @@ std::string PredefinedExpr::ComputeName(PredefinedIdentKind IK, GD = GlobalDecl(CD, Ctor_Base); else if (const CXXDestructorDecl *DD = dyn_cast(ND)) GD = GlobalDecl(DD, Dtor_Base); - else if (ND->hasAttr() || - ND->hasAttr()) - GD = GlobalDecl(cast(ND)); - else + else if (auto FD = dyn_cast(ND)) { + GD = FD->isReferenceableKernel() ? GlobalDecl(FD) : GlobalDecl(ND); + } else GD = GlobalDecl(ND); MC->mangleName(GD, Out); diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 29c0bdddca7a0..eb25b19bbdc74 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -1791,7 +1791,7 @@ void CXXNameMangler::mangleOCLDeviceStubName(const IdentifierInfo *II) { // ::= [n] // ::= StringRef OCLDeviceStubNamePrefix = "__clang_ocl_kern_imp_"; - Out << II->getLength() + OCLDeviceStubNamePrefix.size() - 1 + Out << II->getLength() + OCLDeviceStubNamePrefix.size() << OCLDeviceStubNamePrefix << II->getName(); } diff --git a/clang/lib/AST/Mangle.cpp b/clang/lib/AST/Mangle.cpp index 96ae823bf55e0..141957c1cdce0 100644 --- a/clang/lib/AST/Mangle.cpp +++ b/clang/lib/AST/Mangle.cpp @@ -540,9 +540,9 @@ class ASTNameGenerator::Implementation { GD = GlobalDecl(CtorD, Ctor_Complete); else if (const auto *DtorD = dyn_cast(D)) GD = GlobalDecl(DtorD, Dtor_Complete); - else if (D->hasAttr() || D->hasAttr()) - GD = GlobalDecl(cast(D)); - else + else if (const FunctionDecl *FD = dyn_cast(D)) { + GD = FD->isReferenceableKernel() ? GlobalDecl(FD) : GlobalDecl(D); + } else GD = GlobalDecl(D); MC->mangleName(GD, OS); return false; diff --git a/clang/lib/CodeGen/CGBlocks.cpp b/clang/lib/CodeGen/CGBlocks.cpp index b871c31a039d3..faef6a5fbe1f5 100644 --- a/clang/lib/CodeGen/CGBlocks.cpp +++ b/clang/lib/CodeGen/CGBlocks.cpp @@ -48,7 +48,7 @@ CGBlockInfo::CGBlockInfo(const BlockDecl *block, StringRef name) BlockByrefHelpers::~BlockByrefHelpers() {} /// Build the given block as a global block. -static llvm::Constant *buildGlobalBlock(CodeGenModule &CGM, GlobalDecl GD, +static llvm::Constant *buildGlobalBlock(CodeGenModule &CGM, const CGBlockInfo &blockInfo, llvm::Constant *blockFn); @@ -1085,10 +1085,8 @@ llvm::Value *CodeGenFunction::EmitBlockLiteral(const CGBlockInfo &blockInfo) { blockAddr.getPointer(), ConvertType(blockInfo.getBlockExpr()->getType())); if (IsOpenCL) { - CGM.getOpenCLRuntime().recordBlockInfo( - blockInfo.BlockExpression, InvokeFn, result, blockInfo.StructureType, - CurGD && CurGD.isDeclOpenCLKernel() && - (CurGD.getKernelReferenceKind() == KernelReferenceKind::Kernel)); + CGM.getOpenCLRuntime().recordBlockInfo(blockInfo.BlockExpression, InvokeFn, + result, blockInfo.StructureType); } return result; @@ -1266,7 +1264,7 @@ CodeGenModule::GetAddrOfGlobalBlock(const BlockExpr *BE, return getAddrOfGlobalBlockIfEmitted(BE); } -static llvm::Constant *buildGlobalBlock(CodeGenModule &CGM, GlobalDecl GD, +static llvm::Constant *buildGlobalBlock(CodeGenModule &CGM, const CGBlockInfo &blockInfo, llvm::Constant *blockFn) { assert(blockInfo.CanBeGlobal); @@ -1359,9 +1357,7 @@ static llvm::Constant *buildGlobalBlock(CodeGenModule &CGM, GlobalDecl GD, CGM.getOpenCLRuntime().recordBlockInfo( blockInfo.BlockExpression, cast(blockFn->stripPointerCasts()), Result, - literal->getValueType(), - GD && GD.isDeclOpenCLKernel() && - (GD.getKernelReferenceKind() == KernelReferenceKind::Kernel)); + literal->getValueType()); return Result; } @@ -1470,7 +1466,7 @@ llvm::Function *CodeGenFunction::GenerateBlockFunction( auto GenVoidPtrTy = getContext().getLangOpts().OpenCL ? CGM.getOpenCLRuntime().getGenericVoidPointerType() : VoidPtrTy; - buildGlobalBlock(CGM, CurGD, blockInfo, + buildGlobalBlock(CGM, blockInfo, llvm::ConstantExpr::getPointerCast(fn, GenVoidPtrTy)); } diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 55e1fdb5ff389..6f5ccaa2f607d 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -499,7 +499,9 @@ CodeGenTypes::arrangeCXXConstructorCall(const CallArgList &args, /// Arrange the argument and result information for the declaration or /// definition of the given function. const CGFunctionInfo & -CodeGenTypes::arrangeFunctionDeclaration(const FunctionDecl *FD) { +CodeGenTypes::arrangeFunctionDeclaration(const GlobalDecl GD) { + const FunctionDecl *FD = dyn_cast(GD.getDecl()); + assert(FD && "GD must contain FunctionDecl"); if (const CXXMethodDecl *MD = dyn_cast(FD)) if (MD->isImplicitObjectMemberFunction()) return arrangeCXXMethodDeclaration(MD); @@ -509,6 +511,13 @@ CodeGenTypes::arrangeFunctionDeclaration(const FunctionDecl *FD) { assert(isa(FTy)); setCUDAKernelCallingConvention(FTy, CGM, FD); + if (FD->hasAttr() && + GD.getKernelReferenceKind() == KernelReferenceKind::Stub) { + const FunctionType *FT = FTy->getAs(); + CGM.getTargetCodeGenInfo().setOCLKernelStubCallingConvention(FT); + FTy = FT->getCanonicalTypeUnqualified(); + } + // When declaring a function without a prototype, always use a // non-variadic type. if (CanQual noProto = FTy.getAs()) { @@ -581,13 +590,11 @@ CodeGenTypes::arrangeUnprototypedObjCMessageSend(QualType returnType, const CGFunctionInfo & CodeGenTypes::arrangeGlobalDeclaration(GlobalDecl GD) { // FIXME: Do we need to handle ObjCMethodDecl? - const FunctionDecl *FD = cast(GD.getDecl()); - if (isa(GD.getDecl()) || isa(GD.getDecl())) return arrangeCXXStructorDeclaration(GD); - return arrangeFunctionDeclaration(FD); + return arrangeFunctionDeclaration(GD); } /// Arrange a thunk that takes 'this' as the first parameter followed by @@ -2383,15 +2390,6 @@ void CodeGenModule::ConstructAttributeList(StringRef Name, // Collect function IR attributes from the CC lowering. // We'll collect the paramete and result attributes later. CallingConv = FI.getEffectiveCallingConvention(); - GlobalDecl GD = CalleeInfo.getCalleeDecl(); - const Decl *TargetDecl = CalleeInfo.getCalleeDecl().getDecl(); - if (TargetDecl) { - if (auto FD = dyn_cast(TargetDecl)) { - if (FD->hasAttr() && - GD.getKernelReferenceKind() == KernelReferenceKind::Stub) - CallingConv = llvm::CallingConv::C; - } - } if (FI.isNoReturn()) FuncAttrs.addAttribute(llvm::Attribute::NoReturn); if (FI.isCmseNSCall()) @@ -2400,6 +2398,7 @@ void CodeGenModule::ConstructAttributeList(StringRef Name, // Collect function IR attributes from the callee prototype if we have one. AddAttributesFromFunctionProtoType(getContext(), FuncAttrs, CalleeInfo.getCalleeFunctionProtoType()); + const Decl *TargetDecl = CalleeInfo.getCalleeDecl().getDecl(); // Attach assumption attributes to the declaration. If this is a call // site, attach assumptions from the caller to the call as well. diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 1a5c5f2cf2d7f..e39b5aad3bacf 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -5752,6 +5752,12 @@ static CGCallee EmitDirectCallee(CodeGenFunction &CGF, GlobalDecl GD) { return CGCallee::forDirect(CalleePtr, GD); } +static GlobalDecl getGlobalDeclForDirectCall(const FunctionDecl *FD) { + if (FD->hasAttr()) + return GlobalDecl(FD, KernelReferenceKind::Stub); + return GlobalDecl(FD); +} + CGCallee CodeGenFunction::EmitCallee(const Expr *E) { E = E->IgnoreParens(); @@ -5765,10 +5771,7 @@ CGCallee CodeGenFunction::EmitCallee(const Expr *E) { // Resolve direct calls. } else if (auto DRE = dyn_cast(E)) { if (auto FD = dyn_cast(DRE->getDecl())) { - auto CalleeDecl = FD->hasAttr() - ? GlobalDecl(FD, KernelReferenceKind::Stub) - : FD; - return EmitDirectCallee(*this, CalleeDecl); + return EmitDirectCallee(*this, getGlobalDeclForDirectCall(FD)); } } else if (auto ME = dyn_cast(E)) { if (auto FD = dyn_cast(ME->getMemberDecl())) { @@ -6137,6 +6140,12 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType, const auto *FnType = cast(PointeeType); + if (auto FD = dyn_cast_or_null(TargetDecl)) { + if (FD->hasAttr()) { + CGM.getTargetCodeGenInfo().setOCLKernelStubCallingConvention(FnType); + } + } + // If we are checking indirect calls and this call is indirect, check that the // function pointer is a member of the bit set for the function type. if (SanOpts.has(SanitizerKind::CFIICall) && diff --git a/clang/lib/CodeGen/CGOpenCLRuntime.cpp b/clang/lib/CodeGen/CGOpenCLRuntime.cpp index 4481c06d5eb2a..9f8ff488755ec 100644 --- a/clang/lib/CodeGen/CGOpenCLRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenCLRuntime.cpp @@ -126,12 +126,8 @@ static const BlockExpr *getBlockExpr(const Expr *E) { /// corresponding block expression. void CGOpenCLRuntime::recordBlockInfo(const BlockExpr *E, llvm::Function *InvokeF, - llvm::Value *Block, llvm::Type *BlockTy, - bool isBlkExprInOCLKern) { - - assert((!EnqueuedBlockMap.contains(E) || - EnqueuedBlockMap[E].isBlkExprInOCLKern != isBlkExprInOCLKern) && - "Block expression emitted twice"); + llvm::Value *Block, llvm::Type *BlockTy) { + assert(!EnqueuedBlockMap.contains(E) && "Block expression emitted twice"); assert(isa(InvokeF) && "Invalid invoke function"); assert(Block->getType()->isPointerTy() && "Invalid block literal type"); EnqueuedBlockInfo &BlockInfo = EnqueuedBlockMap[E]; diff --git a/clang/lib/CodeGen/CGOpenCLRuntime.h b/clang/lib/CodeGen/CGOpenCLRuntime.h index 78bb5980cd87d..34613c3516f37 100644 --- a/clang/lib/CodeGen/CGOpenCLRuntime.h +++ b/clang/lib/CodeGen/CGOpenCLRuntime.h @@ -46,7 +46,6 @@ class CGOpenCLRuntime { llvm::Value *KernelHandle; /// Enqueued block kernel reference. llvm::Value *BlockArg; /// The first argument to enqueued block kernel. llvm::Type *BlockTy; /// Type of the block argument. - bool isBlkExprInOCLKern; /// Does the BlockExpr reside in an OpenCL Kernel. }; /// Maps block expression to block information. llvm::DenseMap EnqueuedBlockMap; @@ -94,8 +93,7 @@ class CGOpenCLRuntime { /// \param InvokeF invoke function emitted for the block expression. /// \param Block block literal emitted for the block expression. void recordBlockInfo(const BlockExpr *E, llvm::Function *InvokeF, - llvm::Value *Block, llvm::Type *BlockTy, - bool isBlkExprInOCLKern); + llvm::Value *Block, llvm::Type *BlockTy); /// \return LLVM block invoke function emitted for an expression derived from /// the block expression. diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index b55003b9b0bbb..650922dcfac3f 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -1595,6 +1595,26 @@ void CodeGenFunction::GenerateCode(GlobalDecl GD, llvm::Function *Fn, // Implicit copy-assignment gets the same special treatment as implicit // copy-constructors. emitImplicitAssignmentOperatorBody(Args); + } else if (FD->hasAttr() && + GD.getKernelReferenceKind() == KernelReferenceKind::Kernel) { + CallArgList CallArgs; + for (unsigned i = 0; i < Args.size(); ++i) { + Address ArgAddr = GetAddrOfLocalVar(Args[i]); + QualType ArgQualType = Args[i]->getType(); + RValue ArgRValue = convertTempToRValue(ArgAddr, ArgQualType, Loc); + CallArgs.add(ArgRValue, ArgQualType); + } + GlobalDecl GDStub = GlobalDecl(FD, KernelReferenceKind::Stub); + const FunctionType *FT = cast(FD->getType()); + CGM.getTargetCodeGenInfo().setOCLKernelStubCallingConvention(FT); + const CGFunctionInfo &FnInfo = CGM.getTypes().arrangeFreeFunctionCall( + CallArgs, FT, /*ChainCall=*/false); + llvm::FunctionType *FTy = CGM.getTypes().GetFunctionType(FnInfo); + llvm::Constant *GDStubFunctionPointer = + CGM.getRawFunctionPointer(GDStub, FTy); + CGCallee GDStubCallee = CGCallee::forDirect(GDStubFunctionPointer, GDStub); + EmitCall(FnInfo, GDStubCallee, ReturnValueSlot(), CallArgs, nullptr, false, + Loc); // set IsMustTail=true? } else if (Body) { EmitFunctionBody(Body); } else diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 726414aed49f4..8c0a2896a894f 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -4864,6 +4864,11 @@ CodeGenModule::GetAddrOfFunction(GlobalDecl GD, llvm::Type *Ty, bool ForVTable, if (!Ty) { const auto *FD = cast(GD.getDecl()); Ty = getTypes().ConvertType(FD->getType()); + if (FD->hasAttr() && + GD.getKernelReferenceKind() == KernelReferenceKind::Stub) { + const CGFunctionInfo &FI = getTypes().arrangeGlobalDeclaration(GD); + Ty = getTypes().GetFunctionType(FI); + } } // Devirtualized destructor calls may come through here instead of via @@ -6143,6 +6148,17 @@ void CodeGenModule::EmitGlobalFunctionDefinition(GlobalDecl GD, CodeGenFunction(*this).GenerateCode(GD, Fn, FI); setNonAliasAttributes(GD, Fn); + + if (D->hasAttr()) { + if (GD.getKernelReferenceKind() == KernelReferenceKind::Stub) { + if (Fn->hasFnAttribute(llvm::Attribute::NoInline)) + Fn->removeFnAttr(llvm::Attribute::NoInline); + if (Fn->hasFnAttribute(llvm::Attribute::InlineHint)) + Fn->removeFnAttr(llvm::Attribute::InlineHint); + Fn->addFnAttr(llvm::Attribute::AlwaysInline); + } + } + SetLLVMFunctionAttributesForDefinition(D, Fn); if (const ConstructorAttr *CA = D->getAttr()) diff --git a/clang/lib/CodeGen/CodeGenTypes.h b/clang/lib/CodeGen/CodeGenTypes.h index 5aebf9a212237..307048bcc510d 100644 --- a/clang/lib/CodeGen/CodeGenTypes.h +++ b/clang/lib/CodeGen/CodeGenTypes.h @@ -207,7 +207,7 @@ class CodeGenTypes { /// Free functions are functions that are compatible with an ordinary /// C function pointer type. - const CGFunctionInfo &arrangeFunctionDeclaration(const FunctionDecl *FD); + const CGFunctionInfo &arrangeFunctionDeclaration(const GlobalDecl GD); const CGFunctionInfo &arrangeFreeFunctionCall(const CallArgList &Args, const FunctionType *Ty, bool ChainCall); diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h index 5df19fbef1e5b..9149e2fe8a0ab 100644 --- a/clang/lib/CodeGen/TargetInfo.h +++ b/clang/lib/CodeGen/TargetInfo.h @@ -400,7 +400,8 @@ class TargetCodeGenInfo { virtual bool shouldEmitDWARFBitFieldSeparators() const { return false; } virtual void setCUDAKernelCallingConvention(const FunctionType *&FT) const {} - + virtual void + setOCLKernelStubCallingConvention(const FunctionType *&FT) const {} /// Return the device-side type for the CUDA device builtin surface type. virtual llvm::Type *getCUDADeviceBuiltinSurfaceDeviceType() const { // By default, no change from the original one. diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index db2a2c5740646..8e92c591671f8 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -331,6 +331,8 @@ class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo { bool shouldEmitStaticExternCAliases() const override; bool shouldEmitDWARFBitFieldSeparators() const override; void setCUDAKernelCallingConvention(const FunctionType *&FT) const override; + void + setOCLKernelStubCallingConvention(const FunctionType *&FT) const override; }; } @@ -592,6 +594,12 @@ getAMDGPURuntimeHandleType(llvm::LLVMContext &C, "block.runtime.handle.t"); } +void AMDGPUTargetCodeGenInfo::setOCLKernelStubCallingConvention( + const FunctionType *&FT) const { + FT = getABIInfo().getContext().adjustFunctionType( + FT, FT->getExtInfo().withCallingConv(CC_C)); +} + /// Create an OpenCL kernel for an enqueued block. /// /// The type of the first argument (the block literal) is the struct type diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp index 225d9dfbd980b..f35c124f50aa0 100644 --- a/clang/lib/CodeGen/Targets/SPIR.cpp +++ b/clang/lib/CodeGen/Targets/SPIR.cpp @@ -58,6 +58,8 @@ class CommonSPIRTargetCodeGenInfo : public TargetCodeGenInfo { llvm::Type *getSPIRVImageTypeFromHLSLResource( const HLSLAttributedResourceType::Attributes &attributes, llvm::Type *ElementType, llvm::LLVMContext &Ctx) const; + void + setOCLKernelStubCallingConvention(const FunctionType *&FT) const override; }; class SPIRVTargetCodeGenInfo : public CommonSPIRTargetCodeGenInfo { public: @@ -230,6 +232,12 @@ void SPIRVTargetCodeGenInfo::setCUDAKernelCallingConvention( } } +void CommonSPIRTargetCodeGenInfo::setOCLKernelStubCallingConvention( + const FunctionType *&FT) const { + FT = getABIInfo().getContext().adjustFunctionType( + FT, FT->getExtInfo().withCallingConv(CC_SpirFunction)); +} + LangAS SPIRVTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM, const VarDecl *D) const { diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp index b36a6e1396653..3799e6d7c2545 100644 --- a/clang/lib/CodeGen/Targets/X86.cpp +++ b/clang/lib/CodeGen/Targets/X86.cpp @@ -1495,6 +1495,12 @@ class X86_64TargetCodeGenInfo : public TargetCodeGenInfo { const FunctionDecl *Caller, const FunctionDecl *Callee, const CallArgList &Args, QualType ReturnType) const override; + + void + setOCLKernelStubCallingConvention(const FunctionType *&FT) const override { + FT = getABIInfo().getContext().adjustFunctionType( + FT, FT->getExtInfo().withCallingConv(CC_SpirFunction)); + } }; } // namespace diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl index effdeb9546800..a92648e01db12 100644 --- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl +++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl @@ -46,7 +46,6 @@ struct LargeStructTwoMember { struct LargeStructOneMember g_s; #endif -// // X86-LABEL: define void @foo( // X86-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT4X4:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT3X3:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0:[0-9]+]] { // X86-NEXT: [[ENTRY:.*:]] @@ -106,134 +105,157 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { return out; } -// // X86-LABEL: define spir_kernel void @ker( // X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { // X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 +// X86-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 +// X86-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4 +// X86-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4 // X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4 -// X86-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4 -// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4 // X86-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4 // X86-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4 -// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4 -// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i32 0 -// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4 -// X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[TMP1]], i32 1 -// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 36, i1 false) -// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3:[0-9]+]] -// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 64, i1 false) +// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4 +// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4 +// X86-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_I]], align 4 +// X86-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_I]], align 4 +// X86-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_I]], align 4 +// X86-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_I]], align 4 +// X86-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[TMP3]], i32 1 +// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i32 36, i1 false) +// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP_I]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4:[0-9]+]] +// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[TMP2]], ptr align 4 [[TMP_I]], i32 64, i1 false) // X86-NEXT: ret void // // AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker( // AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { // AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) // AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) // AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 // AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0 -// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 -// AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1 -// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4 -// AMDGCN-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]] -// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0 -// AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4 -// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false) +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[IN_ADDR_I]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP1]], ptr addrspace(5) [[OUT_ADDR_I]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR_I]], align 8 +// AMDGCN-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR_I]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP3]], i64 1 +// AMDGCN-NEXT: [[TMP4:%.*]] = load [9 x i32], ptr addrspace(1) [[ARRAYIDX1_I]], align 4 +// AMDGCN-NEXT: [[CALL_I:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP4]]) #[[ATTR4:[0-9]+]] +// AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL_I]], 0 +// AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP_I]], align 4 +// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(5) align 4 [[TMP_I]], i64 64, i1 false) // AMDGCN-NEXT: ret void // // AMDGCN20-LABEL: define dso_local amdgpu_kernel void @ker( // AMDGCN20-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { // AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN20-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN20-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) // AMDGCN20-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN20-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN20-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) // AMDGCN20-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr // AMDGCN20-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr // AMDGCN20-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8 // AMDGCN20-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 -// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 -// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0 -// AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8 -// AMDGCN20-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1 -// AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0 -// AMDGCN20-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4 -// AMDGCN20-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]] -// AMDGCN20-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0 -// AMDGCN20-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0 -// AMDGCN20-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4 -// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false) +// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8 +// AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// AMDGCN20-NEXT: [[IN_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR_I]] to ptr +// AMDGCN20-NEXT: [[OUT_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR_I]] to ptr +// AMDGCN20-NEXT: [[TMP_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[TMP_I]] to ptr +// AMDGCN20-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_ASCAST_I]], align 8 +// AMDGCN20-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_ASCAST_I]], align 8 +// AMDGCN20-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST_I]], align 8 +// AMDGCN20-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST_I]], align 8 +// AMDGCN20-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP3]], i64 1 +// AMDGCN20-NEXT: [[TMP4:%.*]] = load [9 x i32], ptr addrspace(1) [[ARRAYIDX1_I]], align 4 +// AMDGCN20-NEXT: [[CALL_I:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP4]]) #[[ATTR4:[0-9]+]] +// AMDGCN20-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL_I]], 0 +// AMDGCN20-NEXT: store [16 x i32] [[TMP5]], ptr [[TMP_ASCAST_I]], align 4 +// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[TMP2]], ptr align 4 [[TMP_ASCAST_I]], i64 64, i1 false) // AMDGCN20-NEXT: ret void // // SPIR-LABEL: define dso_local spir_kernel void @ker( // SPIR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6:![0-9]+]] { // SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4 +// SPIR-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4 // SPIR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // SPIR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4 -// SPIR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4 -// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4 // SPIR-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4 // SPIR-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4 -// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4 -// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i32 0 -// SPIR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4 -// SPIR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[TMP1]], i32 1 -// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 36, i1 false) -// SPIR-NEXT: call spir_func void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3:[0-9]+]] -// SPIR-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 64, i1 false) +// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4 +// SPIR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4 +// SPIR-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_I]], align 4 +// SPIR-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_I]], align 4 +// SPIR-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_I]], align 4 +// SPIR-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_I]], align 4 +// SPIR-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[TMP3]], i32 1 +// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i32 36, i1 false) +// SPIR-NEXT: call spir_func void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP_I]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4:[0-9]+]] +// SPIR-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[TMP2]], ptr align 4 [[TMP_I]], i32 64, i1 false) // SPIR-NEXT: ret void // // AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @ker( // AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { // AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) // AMDGCN30-GVAR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN30-GVAR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) // AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 // AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0 -// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1 -// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4 -// AMDGCN30-GVAR-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]] -// AMDGCN30-GVAR-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0 -// AMDGCN30-GVAR-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4 -// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false) +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[IN_ADDR_I]], align 8 +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[TMP1]], ptr addrspace(5) [[OUT_ADDR_I]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR_I]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR_I]], align 8 +// AMDGCN30-GVAR-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP3]], i64 1 +// AMDGCN30-GVAR-NEXT: [[TMP4:%.*]] = load [9 x i32], ptr addrspace(1) [[ARRAYIDX1_I]], align 4 +// AMDGCN30-GVAR-NEXT: [[CALL_I:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP4]]) #[[ATTR4:[0-9]+]] +// AMDGCN30-GVAR-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL_I]], 0 +// AMDGCN30-GVAR-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP_I]], align 4 +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(5) align 4 [[TMP_I]], i64 64, i1 false) // AMDGCN30-GVAR-NEXT: ret void // // AMDGCN30-LABEL: define dso_local amdgpu_kernel void @ker( // AMDGCN30-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { // AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) // AMDGCN30-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN30-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) // AMDGCN30-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 // AMDGCN30-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN30-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0 -// AMDGCN30-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 -// AMDGCN30-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1 -// AMDGCN30-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0 -// AMDGCN30-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4 -// AMDGCN30-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]] -// AMDGCN30-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0 -// AMDGCN30-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0 -// AMDGCN30-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4 -// AMDGCN30-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false) +// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[IN_ADDR_I]], align 8 +// AMDGCN30-NEXT: store ptr addrspace(1) [[TMP1]], ptr addrspace(5) [[OUT_ADDR_I]], align 8 +// AMDGCN30-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR_I]], align 8 +// AMDGCN30-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR_I]], align 8 +// AMDGCN30-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP3]], i64 1 +// AMDGCN30-NEXT: [[TMP4:%.*]] = load [9 x i32], ptr addrspace(1) [[ARRAYIDX1_I]], align 4 +// AMDGCN30-NEXT: [[CALL_I:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP4]]) #[[ATTR4:[0-9]+]] +// AMDGCN30-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL_I]], 0 +// AMDGCN30-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP_I]], align 4 +// AMDGCN30-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(5) align 4 [[TMP_I]], i64 64, i1 false) // AMDGCN30-NEXT: ret void // kernel void ker(global Mat3X3 *in, global Mat4X4 *out) { out[0] = foo(in[1]); } -// // X86-LABEL: define void @foo_large( // X86-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT32X32:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0]] { // X86-NEXT: [[ENTRY:.*:]] @@ -249,7 +271,7 @@ kernel void ker(global Mat3X3 *in, global Mat4X4 *out) { // AMDGCN-NEXT: ret void // // AMDGCN20-LABEL: define dso_local void @foo_large( -// AMDGCN20-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN20-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { // AMDGCN20-NEXT: [[ENTRY:.*:]] // AMDGCN20-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) // AMDGCN20-NEXT: [[IN:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr @@ -280,122 +302,153 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { return out; } -// // X86-LABEL: define spir_kernel void @ker_large( // X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { // X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 +// X86-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 +// X86-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4 +// X86-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4 // X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4 -// X86-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4 -// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4 // X86-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4 // X86-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4 -// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4 -// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i32 0 -// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4 -// X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i32 1 -// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 4096, i1 false) -// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] -// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 16384, i1 false) +// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4 +// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4 +// X86-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_I]], align 4 +// X86-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_I]], align 4 +// X86-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_I]], align 4 +// X86-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_I]], align 4 +// X86-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP3]], i32 1 +// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i32 4096, i1 false) +// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]] +// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[TMP2]], ptr align 4 [[TMP_I]], i32 16384, i1 false) // X86-NEXT: ret void // // AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker_large( // AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { // AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) // AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) -// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) // AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 // AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0 -// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 -// AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1 -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false) -// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] -// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false) +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[IN_ADDR_I]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP1]], ptr addrspace(5) [[OUT_ADDR_I]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR_I]], align 8 +// AMDGCN-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR_I]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP3]], i64 1 +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i64 4096, i1 false) +// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]] +// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(5) align 4 [[TMP_I]], i64 16384, i1 false) // AMDGCN-NEXT: ret void // // AMDGCN20-LABEL: define dso_local amdgpu_kernel void @ker_large( // AMDGCN20-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { // AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN20-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN20-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) +// AMDGCN20-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) // AMDGCN20-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN20-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN20-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) -// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) // AMDGCN20-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr // AMDGCN20-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr // AMDGCN20-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8 // AMDGCN20-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 -// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 -// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0 -// AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8 -// AMDGCN20-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1 -// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false) -// AMDGCN20-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] -// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false) +// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8 +// AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// AMDGCN20-NEXT: [[IN_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR_I]] to ptr +// AMDGCN20-NEXT: [[OUT_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR_I]] to ptr +// AMDGCN20-NEXT: [[TMP_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[TMP_I]] to ptr +// AMDGCN20-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_ASCAST_I]], align 8 +// AMDGCN20-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_ASCAST_I]], align 8 +// AMDGCN20-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST_I]], align 8 +// AMDGCN20-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST_I]], align 8 +// AMDGCN20-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP3]], i64 1 +// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i64 4096, i1 false) +// AMDGCN20-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_ASCAST_I]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]] +// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[TMP2]], ptr align 4 [[TMP_ASCAST_I]], i64 16384, i1 false) // AMDGCN20-NEXT: ret void // // SPIR-LABEL: define dso_local spir_kernel void @ker_large( // SPIR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META7:![0-9]+]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META6]] { // SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4 +// SPIR-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4 // SPIR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // SPIR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4 -// SPIR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4 -// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4 // SPIR-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4 // SPIR-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4 -// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4 -// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i32 0 -// SPIR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4 -// SPIR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i32 1 -// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 4096, i1 false) -// SPIR-NEXT: call spir_func void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] -// SPIR-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 16384, i1 false) +// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4 +// SPIR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4 +// SPIR-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_I]], align 4 +// SPIR-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_I]], align 4 +// SPIR-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_I]], align 4 +// SPIR-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_I]], align 4 +// SPIR-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP3]], i32 1 +// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i32 4096, i1 false) +// SPIR-NEXT: call spir_func void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]] +// SPIR-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[TMP2]], ptr align 4 [[TMP_I]], i32 16384, i1 false) // SPIR-NEXT: ret void // // AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @ker_large( // AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { // AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) // AMDGCN30-GVAR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN30-GVAR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) // AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 // AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0 -// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1 -// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false) -// AMDGCN30-GVAR-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] -// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false) +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[IN_ADDR_I]], align 8 +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[TMP1]], ptr addrspace(5) [[OUT_ADDR_I]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR_I]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR_I]], align 8 +// AMDGCN30-GVAR-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP3]], i64 1 +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i64 4096, i1 false) +// AMDGCN30-GVAR-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(5) align 4 [[TMP_I]], i64 16384, i1 false) // AMDGCN30-GVAR-NEXT: ret void // // AMDGCN30-LABEL: define dso_local amdgpu_kernel void @ker_large( // AMDGCN30-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { // AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) +// AMDGCN30-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) // AMDGCN30-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN30-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) -// AMDGCN30-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) // AMDGCN30-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 // AMDGCN30-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN30-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0 -// AMDGCN30-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 -// AMDGCN30-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1 -// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false) -// AMDGCN30-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] -// AMDGCN30-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false) +// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[IN_ADDR_I]], align 8 +// AMDGCN30-NEXT: store ptr addrspace(1) [[TMP1]], ptr addrspace(5) [[OUT_ADDR_I]], align 8 +// AMDGCN30-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR_I]], align 8 +// AMDGCN30-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR_I]], align 8 +// AMDGCN30-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP3]], i64 1 +// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i64 4096, i1 false) +// AMDGCN30-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]] +// AMDGCN30-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(5) align 4 [[TMP_I]], i64 16384, i1 false) // AMDGCN30-NEXT: ret void // kernel void ker_large(global Mat32X32 *in, global Mat64X64 *out) { out[0] = foo_large(in[1]); } -// // X86-LABEL: define void @FuncOneMember( // X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { // X86-NEXT: [[ENTRY:.*:]] @@ -476,7 +529,6 @@ void FuncOneMember(struct StructOneMember u) { u.x = (int2)(0, 0); } -// // X86-LABEL: define void @FuncOneLargeMember( // X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { // X86-NEXT: [[ENTRY:.*:]] @@ -560,12 +612,13 @@ void FuncOneLargeMember(struct LargeStructOneMember u) { } #if (__OPENCL_C_VERSION__ == 200) || (__OPENCL_C_VERSION__ >= 300 && defined(__opencl_c_program_scope_global_variables)) + // AMDGCN20-LABEL: define dso_local void @test_indirect_arg_globl( // AMDGCN20-SAME: ) #[[ATTR0]] { // AMDGCN20-NEXT: [[ENTRY:.*:]] // AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) // AMDGCN20-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 @g_s, i64 800, i1 false) -// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] // AMDGCN20-NEXT: ret void // // AMDGCN30-GVAR-LABEL: define dso_local void @test_indirect_arg_globl( @@ -573,7 +626,7 @@ void FuncOneLargeMember(struct LargeStructOneMember u) { // AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] // AMDGCN30-GVAR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) // AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 @g_s, i64 800, i1 false) -// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] // AMDGCN30-GVAR-NEXT: ret void // void test_indirect_arg_globl(void) { @@ -581,53 +634,52 @@ void test_indirect_arg_globl(void) { } #endif -// // X86-LABEL: define spir_kernel void @test_indirect_arg_local( // X86-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { // X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 4 -// X86-NEXT: call void @llvm.memcpy.p0.p3.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i32 800, i1 false) -// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] +// X86-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 4 +// X86-NEXT: call void @llvm.memcpy.p0.p3.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i32 800, i1 false) +// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]] // X86-NEXT: ret void // // AMDGCN-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local( // AMDGCN-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { // AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP_I]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) +// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP_I]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // // AMDGCN20-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local( // AMDGCN20-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { // AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) -// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN20-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP_I]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) +// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP_I]]) #[[ATTR4]] // AMDGCN20-NEXT: ret void // // SPIR-LABEL: define dso_local spir_kernel void @test_indirect_arg_local( // SPIR-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META8]] { // SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8 -// SPIR-NEXT: call void @llvm.memcpy.p0.p3.i32(ptr align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i32 800, i1 false) -// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] +// SPIR-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8 +// SPIR-NEXT: call void @llvm.memcpy.p0.p3.i32(ptr align 8 [[BYVAL_TEMP_I]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i32 800, i1 false) +// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP_I]]) #[[ATTR4]] // SPIR-NEXT: ret void // // AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local( // AMDGCN30-GVAR-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { // AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) -// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN30-GVAR-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP_I]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) +// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP_I]]) #[[ATTR4]] // AMDGCN30-GVAR-NEXT: ret void // // AMDGCN30-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local( // AMDGCN30-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { // AMDGCN30-NEXT: [[ENTRY:.*:]] -// AMDGCN30-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) -// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN30-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP_I]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) +// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP_I]]) #[[ATTR4]] // AMDGCN30-NEXT: ret void // kernel void test_indirect_arg_local(void) { @@ -635,19 +687,18 @@ kernel void test_indirect_arg_local(void) { FuncOneLargeMember(l_s); } -// // X86-LABEL: define void @test_indirect_arg_private( // X86-SAME: ) #[[ATTR0]] { // X86-NEXT: [[ENTRY:.*:]] // X86-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8 -// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[P_S]]) #[[ATTR3]] +// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[P_S]]) #[[ATTR4]] // X86-NEXT: ret void // // AMDGCN-LABEL: define dso_local void @test_indirect_arg_private( // AMDGCN-SAME: ) #[[ATTR0]] { // AMDGCN-NEXT: [[ENTRY:.*:]] // AMDGCN-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]] +// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // // AMDGCN20-LABEL: define dso_local void @test_indirect_arg_private( @@ -657,28 +708,28 @@ kernel void test_indirect_arg_local(void) { // AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN20-NEXT: [[P_S_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_S]] to ptr // AMDGCN20-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[P_S_ASCAST]], i64 800, i1 false) -// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] // AMDGCN20-NEXT: ret void // // SPIR-LABEL: define dso_local spir_func void @test_indirect_arg_private( // SPIR-SAME: ) #[[ATTR0]] { // SPIR-NEXT: [[ENTRY:.*:]] // SPIR-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8 -// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]] +// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR4]] // SPIR-NEXT: ret void // // AMDGCN30-GVAR-LABEL: define dso_local void @test_indirect_arg_private( // AMDGCN30-GVAR-SAME: ) #[[ATTR0]] { // AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] // AMDGCN30-GVAR-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]] +// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR4]] // AMDGCN30-GVAR-NEXT: ret void // // AMDGCN30-LABEL: define dso_local void @test_indirect_arg_private( // AMDGCN30-SAME: ) #[[ATTR0]] { // AMDGCN30-NEXT: [[ENTRY:.*:]] // AMDGCN30-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]] +// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR4]] // AMDGCN30-NEXT: ret void // void test_indirect_arg_private(void) { @@ -686,201 +737,247 @@ void test_indirect_arg_private(void) { FuncOneLargeMember(p_s); } -// // X86-LABEL: define spir_kernel void @KernelOneMember( // X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { // X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]] +// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 +// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[U1]], ptr align 1 [[U]], i64 8, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 8, i1 false) +// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U_I]]) #[[ATTR4]] // X86-NEXT: ret void // // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMember( // AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { // AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 // AMDGCN-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 -// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]] +// AMDGCN-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[U_I]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[U_I]], align 8 +// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // // AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelOneMember( // AMDGCN20-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { // AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr // AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 // AMDGCN20-NEXT: store <2 x i32> [[U_COERCE]], ptr [[COERCE_DIVE]], align 8 // AMDGCN20-NEXT: [[COERCE_DIVE2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 // AMDGCN20-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[COERCE_DIVE2]], align 8 -// AMDGCN20-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]] +// AMDGCN20-NEXT: [[U1_I:%.*]] = addrspacecast ptr addrspace(5) [[U_I]] to ptr +// AMDGCN20-NEXT: store <2 x i32> [[TMP0]], ptr [[U1_I]], align 8 +// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[U1_I]], align 8 +// AMDGCN20-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR4]] // AMDGCN20-NEXT: ret void // // SPIR-LABEL: define dso_local spir_kernel void @KernelOneMember( // SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META10:![0-9]+]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META11]] !kernel_arg_type_qual [[META12:![0-9]+]] { // SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: call spir_func void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]] +// SPIR-NEXT: [[U1:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 +// SPIR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[U1]], ptr align 1 [[U]], i64 8, i1 false) +// SPIR-NEXT: call spir_func void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 8 [[U1]]) #[[ATTR4]] // SPIR-NEXT: ret void // // AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelOneMember( // AMDGCN30-GVAR-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { // AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 // AMDGCN30-GVAR-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 -// AMDGCN30-GVAR-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]] +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[U_I]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[U_I]], align 8 +// AMDGCN30-GVAR-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR4]] // AMDGCN30-GVAR-NEXT: ret void // // AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelOneMember( // AMDGCN30-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { // AMDGCN30-NEXT: [[ENTRY:.*:]] -// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN30-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 // AMDGCN30-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN30-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 -// AMDGCN30-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]] +// AMDGCN30-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[U_I]], align 8 +// AMDGCN30-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[U_I]], align 8 +// AMDGCN30-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR4]] // AMDGCN30-NEXT: ret void // kernel void KernelOneMember(struct StructOneMember u) { FuncOneMember(u); } -// // X86-LABEL: define spir_kernel void @KernelOneMemberSpir( // X86-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { // X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[U_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 +// X86-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 4 // X86-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4 -// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 4 // X86-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 4 // X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4 -// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 8 [[TMP0]], i32 8, i1 false) -// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] +// X86-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[U_ADDR_I]], align 4 +// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_I]], align 4 +// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 8 [[TMP1]], i32 8, i1 false) +// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]] // X86-NEXT: ret void // // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir( // AMDGCN-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { // AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8 // AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8 -// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8 -// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR3]] +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[U_ADDR_I]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR_I]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(1) [[TMP1]], align 8 +// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP2]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // // AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir( // AMDGCN20-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { // AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[U_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN20-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN20-NEXT: [[U_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[U_ADDR]] to ptr // AMDGCN20-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR_ASCAST]], align 8 // AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_ASCAST]], align 8 -// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0 -// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8 -// AMDGCN20-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR3]] +// AMDGCN20-NEXT: [[U_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[U_ADDR_I]] to ptr +// AMDGCN20-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[U_ADDR_ASCAST_I]], align 8 +// AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_ASCAST_I]], align 8 +// AMDGCN20-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(1) [[TMP1]], align 8 +// AMDGCN20-NEXT: call void @FuncOneMember(<2 x i32> [[TMP2]]) #[[ATTR4]] // AMDGCN20-NEXT: ret void // // SPIR-LABEL: define dso_local spir_kernel void @KernelOneMemberSpir( // SPIR-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META13:![0-9]+]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META14:![0-9]+]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META12]] { // SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: [[U_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8 // SPIR-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4 -// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8 // SPIR-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 4 // SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4 -// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 [[TMP0]], i32 8, i1 false) -// SPIR-NEXT: call spir_func void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] +// SPIR-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[U_ADDR_I]], align 4 +// SPIR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_I]], align 4 +// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 8 [[BYVAL_TEMP_I]], ptr addrspace(1) align 8 [[TMP1]], i32 8, i1 false) +// SPIR-NEXT: call spir_func void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP_I]]) #[[ATTR4]] // SPIR-NEXT: ret void // // AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir( // AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { // AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN30-GVAR-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8 // AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8 -// AMDGCN30-GVAR-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR3]] +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[U_ADDR_I]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR_I]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(1) [[TMP1]], align 8 +// AMDGCN30-GVAR-NEXT: call void @FuncOneMember(<2 x i32> [[TMP2]]) #[[ATTR4]] // AMDGCN30-GVAR-NEXT: ret void // // AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir( // AMDGCN30-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { // AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[U_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN30-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN30-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8 // AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8 -// AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0 -// AMDGCN30-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8 -// AMDGCN30-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR3]] +// AMDGCN30-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[U_ADDR_I]], align 8 +// AMDGCN30-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR_I]], align 8 +// AMDGCN30-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(1) [[TMP1]], align 8 +// AMDGCN30-NEXT: call void @FuncOneMember(<2 x i32> [[TMP2]]) #[[ATTR4]] // AMDGCN30-NEXT: ret void // kernel void KernelOneMemberSpir(global struct StructOneMember* u) { FuncOneMember(*u); } -// // X86-LABEL: define spir_kernel void @KernelLargeOneMember( // X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { // X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]] +// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 +// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[U1]], ptr align 1 [[U]], i64 800, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 800, i1 false) +// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U_I]]) #[[ATTR4]] // X86-NEXT: ret void // // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( // AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { // AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 // AMDGCN-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]] +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U_I]], ptr addrspace(5) align 8 [[U]], i64 800, i1 false) +// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U_I]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // // AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( // AMDGCN20-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { // AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[COERCE_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr // AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 // AMDGCN20-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 // AMDGCN20-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8 -// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[U1]], i64 800, i1 false) -// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN20-NEXT: [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5) +// AMDGCN20-NEXT: [[U_I:%.*]] = addrspacecast ptr addrspace(5) [[COERCE_I]] to ptr +// AMDGCN20-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U_I]], ptr addrspace(5) align 8 [[U1_ASCAST]], i64 800, i1 false) +// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP_I]], ptr align 8 [[U_I]], i64 800, i1 false) +// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP_I]]) #[[ATTR4]] // AMDGCN20-NEXT: ret void // // SPIR-LABEL: define dso_local spir_kernel void @KernelLargeOneMember( // SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META12]] { // SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]] +// SPIR-NEXT: [[U1:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 +// SPIR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[U1]], ptr align 1 [[U]], i64 800, i1 false) +// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U1]]) #[[ATTR4]] // SPIR-NEXT: ret void // // AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( // AMDGCN30-GVAR-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { // AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 // AMDGCN30-GVAR-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]] +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U_I]], ptr addrspace(5) align 8 [[U]], i64 800, i1 false) +// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U_I]]) #[[ATTR4]] // AMDGCN30-GVAR-NEXT: ret void // // AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( // AMDGCN30-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { // AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN30-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN30-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 // AMDGCN30-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]] +// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U_I]], ptr addrspace(5) align 8 [[U]], i64 800, i1 false) +// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U_I]]) #[[ATTR4]] // AMDGCN30-NEXT: ret void // kernel void KernelLargeOneMember(struct LargeStructOneMember u) { FuncOneLargeMember(u); } -// // X86-LABEL: define void @FuncTwoMember( // X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { // X86-NEXT: [[ENTRY:.*:]] @@ -969,7 +1066,6 @@ void FuncTwoMember(struct StructTwoMember u) { u.y = (int2)(0, 0); } -// // X86-LABEL: define void @FuncLargeTwoMember( // X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { // X86-NEXT: [[ENTRY:.*:]] @@ -1052,16 +1148,20 @@ void FuncLargeTwoMember(struct LargeStructTwoMember u) { u.y[0] = (int2)(0, 0); } -// // X86-LABEL: define spir_kernel void @KernelTwoMember( // X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { // X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR3]] +// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 +// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[U1]], ptr align 1 [[U]], i64 16, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 16, i1 false) +// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U_I]]) #[[ATTR4]] // X86-NEXT: ret void // // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( // AMDGCN-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { // AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 @@ -1073,12 +1173,19 @@ void FuncLargeTwoMember(struct LargeStructTwoMember u) { // AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 // AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 // AMDGCN-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 -// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]] +// AMDGCN-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(5) [[U_I]], align 8 +// AMDGCN-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_I]], i32 0, i32 1 +// AMDGCN-NEXT: store <2 x i32> [[TMP7]], ptr addrspace(5) [[TMP8]], align 8 +// AMDGCN-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr addrspace(5) [[U_I]], align 8 +// AMDGCN-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_I]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP11:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP10]], align 8 +// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP9]], <2 x i32> [[TMP11]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // // AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( // AMDGCN20-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { // AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr // AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 @@ -1091,18 +1198,28 @@ void FuncLargeTwoMember(struct LargeStructTwoMember u) { // AMDGCN20-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[TMP4]], align 8 // AMDGCN20-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 // AMDGCN20-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8 -// AMDGCN20-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]] +// AMDGCN20-NEXT: [[U1_I:%.*]] = addrspacecast ptr addrspace(5) [[U_I]] to ptr +// AMDGCN20-NEXT: store <2 x i32> [[TMP5]], ptr [[U1_I]], align 8 +// AMDGCN20-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1_I]], i32 0, i32 1 +// AMDGCN20-NEXT: store <2 x i32> [[TMP7]], ptr [[TMP8]], align 8 +// AMDGCN20-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[U1_I]], align 8 +// AMDGCN20-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1_I]], i32 0, i32 1 +// AMDGCN20-NEXT: [[TMP11:%.*]] = load <2 x i32>, ptr [[TMP10]], align 8 +// AMDGCN20-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP9]], <2 x i32> [[TMP11]]) #[[ATTR4]] // AMDGCN20-NEXT: ret void // // SPIR-LABEL: define dso_local spir_kernel void @KernelTwoMember( // SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META12]] { // SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: call spir_func void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]] +// SPIR-NEXT: [[U1:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 +// SPIR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[U1]], ptr align 1 [[U]], i64 16, i1 false) +// SPIR-NEXT: call spir_func void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 8 [[U1]]) #[[ATTR4]] // SPIR-NEXT: ret void // // AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( // AMDGCN30-GVAR-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { // AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 @@ -1114,12 +1231,19 @@ void FuncLargeTwoMember(struct LargeStructTwoMember u) { // AMDGCN30-GVAR-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 // AMDGCN30-GVAR-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 // AMDGCN30-GVAR-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 -// AMDGCN30-GVAR-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]] +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(5) [[U_I]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_I]], i32 0, i32 1 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP7]], ptr addrspace(5) [[TMP8]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr addrspace(5) [[U_I]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_I]], i32 0, i32 1 +// AMDGCN30-GVAR-NEXT: [[TMP11:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP10]], align 8 +// AMDGCN30-GVAR-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP9]], <2 x i32> [[TMP11]]) #[[ATTR4]] // AMDGCN30-GVAR-NEXT: ret void // // AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( // AMDGCN30-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { // AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN30-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN30-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 @@ -1131,23 +1255,33 @@ void FuncLargeTwoMember(struct LargeStructTwoMember u) { // AMDGCN30-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 // AMDGCN30-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 // AMDGCN30-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 -// AMDGCN30-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]] +// AMDGCN30-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(5) [[U_I]], align 8 +// AMDGCN30-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_I]], i32 0, i32 1 +// AMDGCN30-NEXT: store <2 x i32> [[TMP7]], ptr addrspace(5) [[TMP8]], align 8 +// AMDGCN30-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr addrspace(5) [[U_I]], align 8 +// AMDGCN30-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_I]], i32 0, i32 1 +// AMDGCN30-NEXT: [[TMP11:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP10]], align 8 +// AMDGCN30-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP9]], <2 x i32> [[TMP11]]) #[[ATTR4]] // AMDGCN30-NEXT: ret void // kernel void KernelTwoMember(struct StructTwoMember u) { FuncTwoMember(u); } -// // X86-LABEL: define spir_kernel void @KernelLargeTwoMember( // X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { // X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: call void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR3]] +// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 +// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[U1]], ptr align 1 [[U]], i64 480, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 480, i1 false) +// X86-NEXT: call void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U_I]]) #[[ATTR4]] // X86-NEXT: ret void // // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( // AMDGCN-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { // AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 @@ -1155,14 +1289,16 @@ kernel void KernelTwoMember(struct StructTwoMember u) { // AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 // AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 // AMDGCN-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 -// AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]] +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U_I]], ptr addrspace(5) align 8 [[U]], i64 480, i1 false) +// AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U_I]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // // AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( // AMDGCN20-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { // AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[COERCE_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr // AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 // AMDGCN20-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 @@ -1170,19 +1306,25 @@ kernel void KernelTwoMember(struct StructTwoMember u) { // AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 // AMDGCN20-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 // AMDGCN20-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr [[TMP2]], align 8 -// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[U1]], i64 480, i1 false) -// AMDGCN20-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN20-NEXT: [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5) +// AMDGCN20-NEXT: [[U_I:%.*]] = addrspacecast ptr addrspace(5) [[COERCE_I]] to ptr +// AMDGCN20-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U_I]], ptr addrspace(5) align 8 [[U1_ASCAST]], i64 480, i1 false) +// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP_I]], ptr align 8 [[U_I]], i64 480, i1 false) +// AMDGCN20-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[BYVAL_TEMP_I]]) #[[ATTR4]] // AMDGCN20-NEXT: ret void // // SPIR-LABEL: define dso_local spir_kernel void @KernelLargeTwoMember( // SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META12]] { // SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: call spir_func void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]] +// SPIR-NEXT: [[U1:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 +// SPIR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[U1]], ptr align 1 [[U]], i64 480, i1 false) +// SPIR-NEXT: call spir_func void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U1]]) #[[ATTR4]] // SPIR-NEXT: ret void // // AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( // AMDGCN30-GVAR-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { // AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 @@ -1190,12 +1332,14 @@ kernel void KernelTwoMember(struct StructTwoMember u) { // AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 // AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 // AMDGCN30-GVAR-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 -// AMDGCN30-GVAR-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]] +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U_I]], ptr addrspace(5) align 8 [[U]], i64 480, i1 false) +// AMDGCN30-GVAR-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U_I]]) #[[ATTR4]] // AMDGCN30-GVAR-NEXT: ret void // // AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( // AMDGCN30-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { // AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN30-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN30-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 @@ -1203,12 +1347,14 @@ kernel void KernelTwoMember(struct StructTwoMember u) { // AMDGCN30-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 // AMDGCN30-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 // AMDGCN30-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 -// AMDGCN30-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]] +// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U_I]], ptr addrspace(5) align 8 [[U]], i64 480, i1 false) +// AMDGCN30-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U_I]]) #[[ATTR4]] // AMDGCN30-NEXT: ret void // kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { FuncLargeTwoMember(u); } + //. // X86: [[META4]] = !{i32 1, i32 1} // X86: [[META5]] = !{!"none", !"none"} diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl index 2f8ba99a3e416..e25e060ea3ea9 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl @@ -65,24 +65,30 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { // AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker( // AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { // AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) // AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) // AMDGCN-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr // AMDGCN-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr // AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8 // AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 -// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 -// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0 -// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8 -// AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1 -// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4 -// AMDGCN-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]] -// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0 -// AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4 -// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false) +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[IN_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR_I]] to ptr +// AMDGCN-NEXT: [[OUT_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR_I]] to ptr +// AMDGCN-NEXT: [[TMP_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[TMP_I]] to ptr +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_ASCAST_I]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_ASCAST_I]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST_I]], align 8 +// AMDGCN-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST_I]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP3]], i64 1 +// AMDGCN-NEXT: [[TMP4:%.*]] = load [9 x i32], ptr addrspace(1) [[ARRAYIDX1_I]], align 4 +// AMDGCN-NEXT: [[CALL_I:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP4]]) #[[ATTR4:[0-9]+]] +// AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL_I]], 0 +// AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr [[TMP_ASCAST_I]], align 4 +// AMDGCN-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[TMP2]], ptr align 4 [[TMP_ASCAST_I]], i64 64, i1 false) // AMDGCN-NEXT: ret void // kernel void ker(global Mat3X3 *in, global Mat4X4 *out) { @@ -90,7 +96,7 @@ kernel void ker(global Mat3X3 *in, global Mat4X4 *out) { } // AMDGCN-LABEL: define dso_local void @foo_large( -// AMDGCN-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { // AMDGCN-NEXT: [[ENTRY:.*:]] // AMDGCN-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) // AMDGCN-NEXT: [[IN:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr @@ -105,21 +111,29 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { // AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker_large( // AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { // AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) // AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) -// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) // AMDGCN-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr // AMDGCN-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr // AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8 // AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 -// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 -// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0 -// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8 -// AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1 -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false) -// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] -// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false) +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[IN_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR_I]] to ptr +// AMDGCN-NEXT: [[OUT_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR_I]] to ptr +// AMDGCN-NEXT: [[TMP_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[TMP_I]] to ptr +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_ASCAST_I]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_ASCAST_I]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST_I]], align 8 +// AMDGCN-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST_I]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP3]], i64 1 +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i64 4096, i1 false) +// AMDGCN-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_ASCAST_I]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]] +// AMDGCN-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[TMP2]], ptr align 4 [[TMP_ASCAST_I]], i64 16384, i1 false) // AMDGCN-NEXT: ret void // kernel void ker_large(global Mat32X32 *in, global Mat64X64 *out) { @@ -170,7 +184,7 @@ void FuncOneLargeMember(struct LargeStructOneMember u) { // AMDGCN-NEXT: [[ENTRY:.*:]] // AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) // AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 @g_s, i64 800, i1 false) -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // void test_indirect_arg_globl(void) { @@ -181,9 +195,9 @@ void test_indirect_arg_globl(void) { // AMDGCN-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local( // AMDGCN-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { // AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP_I]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) +// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP_I]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // kernel void test_indirect_arg_local(void) { @@ -198,7 +212,7 @@ kernel void test_indirect_arg_local(void) { // AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[P_S_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_S]] to ptr // AMDGCN-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[P_S_ASCAST]], i64 800, i1 false) -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // void test_indirect_arg_private(void) { @@ -209,13 +223,17 @@ void test_indirect_arg_private(void) { // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMember( // AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { // AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr // AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 // AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr [[COERCE_DIVE]], align 8 // AMDGCN-NEXT: [[COERCE_DIVE2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 // AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[COERCE_DIVE2]], align 8 -// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]] +// AMDGCN-NEXT: [[U1_I:%.*]] = addrspacecast ptr addrspace(5) [[U_I]] to ptr +// AMDGCN-NEXT: store <2 x i32> [[TMP0]], ptr [[U1_I]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[U1_I]], align 8 +// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // kernel void KernelOneMember(struct StructOneMember u) { @@ -225,13 +243,16 @@ kernel void KernelOneMember(struct StructOneMember u) { // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir( // AMDGCN-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { // AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[U_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[U_ADDR]] to ptr // AMDGCN-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR_ASCAST]], align 8 // AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_ASCAST]], align 8 -// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8 -// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR3]] +// AMDGCN-NEXT: [[U_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[U_ADDR_I]] to ptr +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[U_ADDR_ASCAST_I]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_ASCAST_I]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(1) [[TMP1]], align 8 +// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP2]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // kernel void KernelOneMemberSpir(global struct StructOneMember* u) { @@ -241,14 +262,18 @@ kernel void KernelOneMemberSpir(global struct StructOneMember* u) { // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( // AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { // AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[COERCE_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr // AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 // AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 // AMDGCN-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8 -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[U1]], i64 800, i1 false) -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN-NEXT: [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5) +// AMDGCN-NEXT: [[U_I:%.*]] = addrspacecast ptr addrspace(5) [[COERCE_I]] to ptr +// AMDGCN-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U_I]], ptr addrspace(5) align 8 [[U1_ASCAST]], i64 800, i1 false) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP_I]], ptr align 8 [[U_I]], i64 800, i1 false) +// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP_I]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // kernel void KernelLargeOneMember(struct LargeStructOneMember u) { @@ -298,6 +323,7 @@ void FuncLargeTwoMember(struct LargeStructTwoMember u) { // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( // AMDGCN-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { // AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr // AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 @@ -310,7 +336,14 @@ void FuncLargeTwoMember(struct LargeStructTwoMember u) { // AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[TMP4]], align 8 // AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 // AMDGCN-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8 -// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]] +// AMDGCN-NEXT: [[U1_I:%.*]] = addrspacecast ptr addrspace(5) [[U_I]] to ptr +// AMDGCN-NEXT: store <2 x i32> [[TMP5]], ptr [[U1_I]], align 8 +// AMDGCN-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1_I]], i32 0, i32 1 +// AMDGCN-NEXT: store <2 x i32> [[TMP7]], ptr [[TMP8]], align 8 +// AMDGCN-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[U1_I]], align 8 +// AMDGCN-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1_I]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP11:%.*]] = load <2 x i32>, ptr [[TMP10]], align 8 +// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP9]], <2 x i32> [[TMP11]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // kernel void KernelTwoMember(struct StructTwoMember u) { @@ -320,8 +353,9 @@ kernel void KernelTwoMember(struct StructTwoMember u) { // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( // AMDGCN-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { // AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[COERCE_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr // AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 // AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 @@ -329,8 +363,11 @@ kernel void KernelTwoMember(struct StructTwoMember u) { // AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 // AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 // AMDGCN-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr [[TMP2]], align 8 -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[U1]], i64 480, i1 false) -// AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN-NEXT: [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5) +// AMDGCN-NEXT: [[U_I:%.*]] = addrspacecast ptr addrspace(5) [[COERCE_I]] to ptr +// AMDGCN-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U_I]], ptr addrspace(5) align 8 [[U1_ASCAST]], i64 480, i1 false) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP_I]], ptr align 8 [[U_I]], i64 480, i1 false) +// AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[BYVAL_TEMP_I]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl index 6776e2227847e..4d09fc3ffb70b 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl @@ -423,7 +423,7 @@ struct_char_arr32 func_ret_struct_char_arr32() return s; } -// CHECK: define{{.*}} i32 @func_transparent_union_ret() local_unnamed_addr #1 { +// CHECK: define{{.*}} i32 @func_transparent_union_ret() local_unnamed_addr #[[ATTR1:[0-9]+]] { // CHECK: ret i32 0 transparent_u func_transparent_union_ret() { diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl index caae5666de29e..10ecfc0a1e6ba 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl @@ -61,10 +61,6 @@ kernel void test_target_features_kernel(global int *i) { } //. -// CHECK: @__test_block_invoke_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle" -// CHECK: @__test_block_invoke_2_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t.0 zeroinitializer, section ".amdgpu.kernel.runtime.handle" -// CHECK: @__test_block_invoke_3_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t.1 zeroinitializer, section ".amdgpu.kernel.runtime.handle" -// CHECK: @__test_block_invoke_4_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t.2 zeroinitializer, section ".amdgpu.kernel.runtime.handle" // CHECK: @__block_literal_global = internal addrspace(1) constant { i32, i32, ptr } { i32 16, i32 8, ptr @__test_target_features_kernel_block_invoke }, align 8 #0 // CHECK: @__test_target_features_kernel_block_invoke_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t.3 zeroinitializer, section ".amdgpu.kernel.runtime.handle" // CHECK: @llvm.used = appending addrspace(1) global [10 x ptr] [ptr @__test_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle to ptr), ptr @__test_block_invoke_2_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle to ptr), ptr @__test_block_invoke_3_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr @__test_block_invoke_4_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to ptr), ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr)], section "llvm.metadata" @@ -95,6 +91,30 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU-NEXT: [[B_ADDR:%.*]] = alloca i8, align 1, addrspace(5) // NOCPU-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // NOCPU-NEXT: [[D_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// NOCPU-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// NOCPU-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// NOCPU-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// NOCPU-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// NOCPU-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// NOCPU-NEXT: store i8 [[B]], ptr [[B_ADDR_ASCAST]], align 1 +// NOCPU-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// NOCPU-NEXT: store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// NOCPU-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8 +// NOCPU-NEXT: [[TMP1:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1 +// NOCPU-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8 +// NOCPU-NEXT: [[TMP3:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8 +// NOCPU-NEXT: call void @__clang_ocl_kern_imp_test(ptr addrspace(1) noundef align 1 [[TMP0]], i8 noundef signext [[TMP1]], ptr addrspace(1) noundef align 8 [[TMP2]], i64 noundef [[TMP3]]) #[[ATTR10:[0-9]+]] +// NOCPU-NEXT: ret void +// +// +// NOCPU: Function Attrs: alwaysinline convergent norecurse nounwind +// NOCPU-LABEL: define {{[^@]+}}@__clang_ocl_kern_imp_test +// NOCPU-SAME: (ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef signext [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META5]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6]] { +// NOCPU-NEXT: entry: +// NOCPU-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// NOCPU-NEXT: [[B_ADDR:%.*]] = alloca i8, align 1, addrspace(5) +// NOCPU-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// NOCPU-NEXT: [[D_ADDR:%.*]] = alloca i64, align 8, addrspace(5) // NOCPU-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // NOCPU-NEXT: [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5) // NOCPU-NEXT: [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5) @@ -145,7 +165,7 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU-NEXT: [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 4 // NOCPU-NEXT: [[TMP3:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1 // NOCPU-NEXT: store i8 [[TMP3]], ptr [[BLOCK_CAPTURED1]], align 8 -// NOCPU-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle to ptr), ptr [[BLOCK_ASCAST]]) +// NOCPU-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr [[TMP_ASCAST]], ptr @__test_block_invoke_kernel, ptr [[BLOCK_ASCAST]]) // NOCPU-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8 // NOCPU-NEXT: [[TMP6:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4 // NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP2_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false) @@ -167,7 +187,7 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU-NEXT: [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 5 // NOCPU-NEXT: [[TMP10:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8 // NOCPU-NEXT: store i64 [[TMP10]], ptr [[BLOCK_CAPTURED10]], align 8 -// NOCPU-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr addrspace(5) [[VARTMP2]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle to ptr), ptr [[BLOCK3_ASCAST]]) +// NOCPU-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr [[TMP2_ASCAST]], ptr @__test_block_invoke_2_kernel, ptr [[BLOCK3_ASCAST]]) // NOCPU-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8 // NOCPU-NEXT: [[TMP13:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4 // NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP11_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false) @@ -191,7 +211,7 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU-NEXT: store i64 [[TMP17]], ptr [[BLOCK_CAPTURED19]], align 8 // NOCPU-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr [[BLOCK_SIZES_ASCAST]], i32 0, i32 0 // NOCPU-NEXT: store i64 100, ptr [[TMP18]], align 8 -// NOCPU-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]]) +// NOCPU-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr [[TMP11_ASCAST]], ptr @__test_block_invoke_3_kernel, ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]]) // NOCPU-NEXT: [[BLOCK_SIZE22:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 0 // NOCPU-NEXT: store i32 32, ptr [[BLOCK_SIZE22]], align 8 // NOCPU-NEXT: [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 1 @@ -209,13 +229,49 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU-NEXT: [[TMP23:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4 // NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP27_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false) // NOCPU-NEXT: [[TMP24:%.*]] = load ptr, ptr [[BLOCK20_ASCAST]], align 8 -// NOCPU-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr addrspace(5) [[VARTMP27]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to ptr), ptr [[BLOCK21_ASCAST]]) +// NOCPU-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr [[TMP27_ASCAST]], ptr @__test_block_invoke_4_kernel, ptr [[BLOCK21_ASCAST]]) +// NOCPU-NEXT: ret void +// +// +// NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone +// NOCPU-LABEL: define {{[^@]+}}@test_target_features_kernel +// NOCPU-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR4:[0-9]+]] !kernel_arg_addr_space [[META7:![0-9]+]] !kernel_arg_access_qual [[META8:![0-9]+]] !kernel_arg_type [[META9:![0-9]+]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META10:![0-9]+]] { +// NOCPU-NEXT: entry: +// NOCPU-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// NOCPU-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// NOCPU-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// NOCPU-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[I_ADDR_ASCAST]], align 8 +// NOCPU-NEXT: call void @__clang_ocl_kern_imp_test_target_features_kernel(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR10]] +// NOCPU-NEXT: ret void +// +// +// NOCPU: Function Attrs: alwaysinline convergent norecurse nounwind +// NOCPU-LABEL: define {{[^@]+}}@__clang_ocl_kern_imp_test_target_features_kernel +// NOCPU-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR5:[0-9]+]] !kernel_arg_addr_space [[META7]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META10]] { +// NOCPU-NEXT: entry: +// NOCPU-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// NOCPU-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// NOCPU-NEXT: [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5) +// NOCPU-NEXT: [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5) +// NOCPU-NEXT: [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5) +// NOCPU-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// NOCPU-NEXT: [[DEFAULT_QUEUE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DEFAULT_QUEUE]] to ptr +// NOCPU-NEXT: [[FLAGS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FLAGS]] to ptr +// NOCPU-NEXT: [[NDRANGE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[NDRANGE]] to ptr +// NOCPU-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr +// NOCPU-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// NOCPU-NEXT: store i32 0, ptr [[FLAGS_ASCAST]], align 4 +// NOCPU-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime() +// NOCPU-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8 +// NOCPU-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4 +// NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false) +// NOCPU-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr [[TMP_ASCAST]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr)) // NOCPU-NEXT: ret void // // // NOCPU: Function Attrs: convergent noinline nounwind optnone // NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke -// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR4:[0-9]+]] { +// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR7:[0-9]+]] { // NOCPU-NEXT: entry: // NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // NOCPU-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -234,7 +290,7 @@ kernel void test_target_features_kernel(global int *i) { // // NOCPU: Function Attrs: convergent nounwind // NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_kernel -// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] !associated [[META7:![0-9]+]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META9:![0-9]+]] !kernel_arg_type [[META10:![0-9]+]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11:![0-9]+]] { +// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR8:[0-9]+]] !kernel_arg_addr_space [[META11:![0-9]+]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META10]] { // NOCPU-NEXT: entry: // NOCPU-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), i8 }>, align 8, addrspace(5) // NOCPU-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -245,7 +301,7 @@ kernel void test_target_features_kernel(global int *i) { // // NOCPU: Function Attrs: convergent noinline nounwind optnone // NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_2 -// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR4]] { +// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR7]] { // NOCPU-NEXT: entry: // NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // NOCPU-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -270,7 +326,7 @@ kernel void test_target_features_kernel(global int *i) { // // NOCPU: Function Attrs: convergent nounwind // NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_2_kernel -// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR5]] !associated [[META12:![0-9]+]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META10]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11]] { +// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR8]] !kernel_arg_addr_space [[META11]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META10]] { // NOCPU-NEXT: entry: // NOCPU-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) // NOCPU-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -281,7 +337,7 @@ kernel void test_target_features_kernel(global int *i) { // // NOCPU: Function Attrs: convergent noinline nounwind optnone // NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_3 -// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]], ptr addrspace(3) noundef [[LP:%.*]]) #[[ATTR4]] { +// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]], ptr addrspace(3) noundef [[LP:%.*]]) #[[ATTR7]] { // NOCPU-NEXT: entry: // NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // NOCPU-NEXT: [[LP_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) @@ -312,7 +368,7 @@ kernel void test_target_features_kernel(global int *i) { // // NOCPU: Function Attrs: convergent nounwind // NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_3_kernel -// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR5]] !associated [[META13:![0-9]+]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META15:![0-9]+]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META17:![0-9]+]] { +// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR8]] !kernel_arg_addr_space [[META13:![0-9]+]] !kernel_arg_access_qual [[META14:![0-9]+]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META16:![0-9]+]] { // NOCPU-NEXT: entry: // NOCPU-NEXT: [[TMP2:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) // NOCPU-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP2]], align 8 @@ -323,7 +379,7 @@ kernel void test_target_features_kernel(global int *i) { // // NOCPU: Function Attrs: convergent noinline nounwind optnone // NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_4 -// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR4]] { +// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR7]] { // NOCPU-NEXT: entry: // NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // NOCPU-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -335,13 +391,13 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU-NEXT: [[TMP0:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR]], align 8 // NOCPU-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4 // NOCPU-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8 -// NOCPU-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR8:[0-9]+]] +// NOCPU-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR11:[0-9]+]] // NOCPU-NEXT: ret void // // // NOCPU: Function Attrs: convergent nounwind // NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_4_kernel -// NOCPU-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR5]] !associated [[META18:![0-9]+]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META10]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11]] { +// NOCPU-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR8]] !kernel_arg_addr_space [[META11]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META10]] { // NOCPU-NEXT: entry: // NOCPU-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5) // NOCPU-NEXT: store <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -350,33 +406,9 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU-NEXT: ret void // // -// NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone -// NOCPU-LABEL: define {{[^@]+}}@test_target_features_kernel -// NOCPU-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR6:[0-9]+]] !kernel_arg_addr_space [[META19:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META20:![0-9]+]] !kernel_arg_base_type [[META20]] !kernel_arg_type_qual [[META11]] { -// NOCPU-NEXT: entry: -// NOCPU-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// NOCPU-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// NOCPU-NEXT: [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5) -// NOCPU-NEXT: [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5) -// NOCPU-NEXT: [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5) -// NOCPU-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr -// NOCPU-NEXT: [[DEFAULT_QUEUE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DEFAULT_QUEUE]] to ptr -// NOCPU-NEXT: [[FLAGS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FLAGS]] to ptr -// NOCPU-NEXT: [[NDRANGE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[NDRANGE]] to ptr -// NOCPU-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr -// NOCPU-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8 -// NOCPU-NEXT: store i32 0, ptr [[FLAGS_ASCAST]], align 4 -// NOCPU-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime() -// NOCPU-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8 -// NOCPU-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4 -// NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false) -// NOCPU-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr), ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr)) -// NOCPU-NEXT: ret void -// -// // NOCPU: Function Attrs: convergent noinline nounwind optnone // NOCPU-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke -// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR4]] { +// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR7]] { // NOCPU-NEXT: entry: // NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // NOCPU-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -390,7 +422,7 @@ kernel void test_target_features_kernel(global int *i) { // // NOCPU: Function Attrs: convergent nounwind // NOCPU-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke_kernel -// NOCPU-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR5]] !associated [[META21:![0-9]+]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META10]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11]] { +// NOCPU-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR8]] !kernel_arg_addr_space [[META11]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META10]] { // NOCPU-NEXT: entry: // NOCPU-NEXT: [[TMP1:%.*]] = alloca { i32, i32, ptr }, align 8, addrspace(5) // NOCPU-NEXT: store { i32, i32, ptr } [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -412,6 +444,8 @@ kernel void test_target_features_kernel(global int *i) { // // // +// +// // GFX900: Function Attrs: convergent norecurse nounwind // GFX900-LABEL: define {{[^@]+}}@callee // GFX900-SAME: (i64 noundef [[ID:%.*]], ptr addrspace(1) noundef [[OUT:%.*]]) #[[ATTR1:[0-9]+]] { @@ -438,6 +472,30 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[B_ADDR:%.*]] = alloca i8, align 1, addrspace(5) // GFX900-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // GFX900-NEXT: [[D_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// GFX900-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// GFX900-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// GFX900-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// GFX900-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// GFX900-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[TBAA14:![0-9]+]] +// GFX900-NEXT: store i8 [[B]], ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16:![0-9]+]] +// GFX900-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[TBAA7]] +// GFX900-NEXT: store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] +// GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[TBAA14]] +// GFX900-NEXT: [[TMP1:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16]] +// GFX900-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[TBAA7]] +// GFX900-NEXT: [[TMP3:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] +// GFX900-NEXT: call void @__clang_ocl_kern_imp_test(ptr addrspace(1) noundef align 1 [[TMP0]], i8 noundef signext [[TMP1]], ptr addrspace(1) noundef align 8 [[TMP2]], i64 noundef [[TMP3]]) #[[ATTR9:[0-9]+]] +// GFX900-NEXT: ret void +// +// +// GFX900: Function Attrs: alwaysinline convergent norecurse nounwind +// GFX900-LABEL: define {{[^@]+}}@__clang_ocl_kern_imp_test +// GFX900-SAME: (ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef signext [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] { +// GFX900-NEXT: entry: +// GFX900-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// GFX900-NEXT: [[B_ADDR:%.*]] = alloca i8, align 1, addrspace(5) +// GFX900-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// GFX900-NEXT: [[D_ADDR:%.*]] = alloca i64, align 8, addrspace(5) // GFX900-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // GFX900-NEXT: [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5) // GFX900-NEXT: [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5) @@ -468,14 +526,14 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[BLOCK20_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK20]] to ptr // GFX900-NEXT: [[BLOCK21_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr // GFX900-NEXT: [[TMP27_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VARTMP27]] to ptr -// GFX900-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[TBAA14:![0-9]+]] -// GFX900-NEXT: store i8 [[B]], ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16:![0-9]+]] +// GFX900-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[TBAA14]] +// GFX900-NEXT: store i8 [[B]], ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16]] // GFX900-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[TBAA7]] // GFX900-NEXT: store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR7:[0-9]+]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR7]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR10:[0-9]+]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR10]] // GFX900-NEXT: store i32 0, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17:![0-9]+]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR7]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR10]] // GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA19:![0-9]+]] // GFX900-NEXT: [[TMP1:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]] // GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21:![0-9]+]] @@ -491,7 +549,7 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 4 // GFX900-NEXT: [[TMP3:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16]] // GFX900-NEXT: store i8 [[TMP3]], ptr [[BLOCK_CAPTURED1]], align 8, !tbaa [[TBAA16]] -// GFX900-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle to ptr), ptr [[BLOCK_ASCAST]]) +// GFX900-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr [[TMP_ASCAST]], ptr @__test_block_invoke_kernel, ptr [[BLOCK_ASCAST]]) // GFX900-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA19]] // GFX900-NEXT: [[TMP6:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]] // GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP2_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] @@ -513,7 +571,7 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 5 // GFX900-NEXT: [[TMP10:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] // GFX900-NEXT: store i64 [[TMP10]], ptr [[BLOCK_CAPTURED10]], align 8, !tbaa [[TBAA3]] -// GFX900-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr addrspace(5) [[VARTMP2]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle to ptr), ptr [[BLOCK3_ASCAST]]) +// GFX900-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr [[TMP2_ASCAST]], ptr @__test_block_invoke_2_kernel, ptr [[BLOCK3_ASCAST]]) // GFX900-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA19]] // GFX900-NEXT: [[TMP13:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]] // GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP11_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] @@ -535,12 +593,12 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[BLOCK_CAPTURED19:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 5 // GFX900-NEXT: [[TMP17:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] // GFX900-NEXT: store i64 [[TMP17]], ptr [[BLOCK_CAPTURED19]], align 8, !tbaa [[TBAA3]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR7]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR10]] // GFX900-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr [[BLOCK_SIZES_ASCAST]], i32 0, i32 0 // GFX900-NEXT: store i64 100, ptr [[TMP18]], align 8 -// GFX900-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]]) -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR7]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR7]] +// GFX900-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr [[TMP11_ASCAST]], ptr @__test_block_invoke_3_kernel, ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]]) +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR10]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR10]] // GFX900-NEXT: [[BLOCK_SIZE22:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 0 // GFX900-NEXT: store i32 32, ptr [[BLOCK_SIZE22]], align 8 // GFX900-NEXT: [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 1 @@ -558,17 +616,59 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[TMP23:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]] // GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP27_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] // GFX900-NEXT: [[TMP24:%.*]] = load ptr, ptr [[BLOCK20_ASCAST]], align 8, !tbaa [[TBAA16]] -// GFX900-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr addrspace(5) [[VARTMP27]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to ptr), ptr [[BLOCK21_ASCAST]]) -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR7]] -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR7]] -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR7]] -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR7]] +// GFX900-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr [[TMP27_ASCAST]], ptr @__test_block_invoke_4_kernel, ptr [[BLOCK21_ASCAST]]) +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR10]] +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR10]] +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR10]] +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR10]] +// GFX900-NEXT: ret void +// +// +// GFX900: Function Attrs: convergent norecurse nounwind +// GFX900-LABEL: define {{[^@]+}}@test_target_features_kernel +// GFX900-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META22:![0-9]+]] !kernel_arg_access_qual [[META23:![0-9]+]] !kernel_arg_type [[META24:![0-9]+]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25:![0-9]+]] { +// GFX900-NEXT: entry: +// GFX900-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// GFX900-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// GFX900-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[TBAA26:![0-9]+]] +// GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[TBAA26]] +// GFX900-NEXT: call void @__clang_ocl_kern_imp_test_target_features_kernel(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR9]] +// GFX900-NEXT: ret void +// +// +// GFX900: Function Attrs: alwaysinline convergent norecurse nounwind +// GFX900-LABEL: define {{[^@]+}}@__clang_ocl_kern_imp_test_target_features_kernel +// GFX900-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META22]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25]] { +// GFX900-NEXT: entry: +// GFX900-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// GFX900-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// GFX900-NEXT: [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5) +// GFX900-NEXT: [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5) +// GFX900-NEXT: [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5) +// GFX900-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// GFX900-NEXT: [[DEFAULT_QUEUE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DEFAULT_QUEUE]] to ptr +// GFX900-NEXT: [[FLAGS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FLAGS]] to ptr +// GFX900-NEXT: [[NDRANGE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[NDRANGE]] to ptr +// GFX900-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr +// GFX900-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[TBAA26]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR10]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR10]] +// GFX900-NEXT: store i32 0, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR10]] +// GFX900-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime() +// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA19]] +// GFX900-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]] +// GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] +// GFX900-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr [[TMP_ASCAST]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr)) +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR10]] +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR10]] +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR10]] // GFX900-NEXT: ret void // // // GFX900: Function Attrs: convergent nounwind // GFX900-LABEL: define {{[^@]+}}@__test_block_invoke -// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR5:[0-9]+]] { +// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR6:[0-9]+]] { // GFX900-NEXT: entry: // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr @@ -584,7 +684,7 @@ kernel void test_target_features_kernel(global int *i) { // // GFX900: Function Attrs: convergent nounwind // GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_kernel -// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR5]] !associated [[META22:![0-9]+]] !kernel_arg_addr_space [[META23:![0-9]+]] !kernel_arg_access_qual [[META24:![0-9]+]] !kernel_arg_type [[META25:![0-9]+]] !kernel_arg_base_type [[META25]] !kernel_arg_type_qual [[META26:![0-9]+]] { +// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR7:[0-9]+]] !kernel_arg_addr_space [[META28:![0-9]+]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META29:![0-9]+]] !kernel_arg_base_type [[META29]] !kernel_arg_type_qual [[META25]] { // GFX900-NEXT: entry: // GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), i8 }>, align 8, addrspace(5) // GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -595,7 +695,7 @@ kernel void test_target_features_kernel(global int *i) { // // GFX900: Function Attrs: convergent nounwind // GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_2 -// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR5]] { +// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR6]] { // GFX900-NEXT: entry: // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr @@ -617,7 +717,7 @@ kernel void test_target_features_kernel(global int *i) { // // GFX900: Function Attrs: convergent nounwind // GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_2_kernel -// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR5]] !associated [[META27:![0-9]+]] !kernel_arg_addr_space [[META23]] !kernel_arg_access_qual [[META24]] !kernel_arg_type [[META25]] !kernel_arg_base_type [[META25]] !kernel_arg_type_qual [[META26]] { +// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR7]] !kernel_arg_addr_space [[META28]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META29]] !kernel_arg_base_type [[META29]] !kernel_arg_type_qual [[META25]] { // GFX900-NEXT: entry: // GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) // GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -628,14 +728,14 @@ kernel void test_target_features_kernel(global int *i) { // // GFX900: Function Attrs: convergent nounwind // GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_3 -// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]], ptr addrspace(3) noundef [[LP:%.*]]) #[[ATTR5]] { +// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]], ptr addrspace(3) noundef [[LP:%.*]]) #[[ATTR6]] { // GFX900-NEXT: entry: // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // GFX900-NEXT: [[LP_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr // GFX900-NEXT: [[LP_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LP_ADDR]] to ptr // GFX900-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8 -// GFX900-NEXT: store ptr addrspace(3) [[LP]], ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[TBAA28:![0-9]+]] +// GFX900-NEXT: store ptr addrspace(3) [[LP]], ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[TBAA30:![0-9]+]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6 // GFX900-NEXT: [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA16]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3 @@ -648,7 +748,7 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR3]], align 8, !tbaa [[TBAA7]] // GFX900-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP3]], i64 0 // GFX900-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8, !tbaa [[TBAA3]] -// GFX900-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[TBAA28]] +// GFX900-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[TBAA30]] // GFX900-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[TMP4]], i64 0 // GFX900-NEXT: store i32 1, ptr addrspace(3) [[ARRAYIDX5]], align 4, !tbaa [[TBAA17]] // GFX900-NEXT: ret void @@ -656,7 +756,7 @@ kernel void test_target_features_kernel(global int *i) { // // GFX900: Function Attrs: convergent nounwind // GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_3_kernel -// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR5]] !associated [[META29:![0-9]+]] !kernel_arg_addr_space [[META30:![0-9]+]] !kernel_arg_access_qual [[META31:![0-9]+]] !kernel_arg_type [[META32:![0-9]+]] !kernel_arg_base_type [[META32]] !kernel_arg_type_qual [[META33:![0-9]+]] { +// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR7]] !kernel_arg_addr_space [[META32:![0-9]+]] !kernel_arg_access_qual [[META33:![0-9]+]] !kernel_arg_type [[META34:![0-9]+]] !kernel_arg_base_type [[META34]] !kernel_arg_type_qual [[META35:![0-9]+]] { // GFX900-NEXT: entry: // GFX900-NEXT: [[TMP2:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) // GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP2]], align 8 @@ -667,7 +767,7 @@ kernel void test_target_features_kernel(global int *i) { // // GFX900: Function Attrs: convergent nounwind // GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_4 -// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR5]] { +// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR6]] { // GFX900-NEXT: entry: // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr @@ -676,13 +776,13 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[TMP0:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA3]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4 // GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[TBAA7]] -// GFX900-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR8:[0-9]+]] +// GFX900-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR11:[0-9]+]] // GFX900-NEXT: ret void // // // GFX900: Function Attrs: convergent nounwind // GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_4_kernel -// GFX900-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR5]] !associated [[META34:![0-9]+]] !kernel_arg_addr_space [[META23]] !kernel_arg_access_qual [[META24]] !kernel_arg_type [[META25]] !kernel_arg_base_type [[META25]] !kernel_arg_type_qual [[META26]] { +// GFX900-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR7]] !kernel_arg_addr_space [[META28]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META29]] !kernel_arg_base_type [[META29]] !kernel_arg_type_qual [[META25]] { // GFX900-NEXT: entry: // GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5) // GFX900-NEXT: store <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -691,39 +791,9 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: ret void // // -// GFX900: Function Attrs: convergent norecurse nounwind -// GFX900-LABEL: define {{[^@]+}}@test_target_features_kernel -// GFX900-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META35:![0-9]+]] !kernel_arg_access_qual [[META24]] !kernel_arg_type [[META36:![0-9]+]] !kernel_arg_base_type [[META36]] !kernel_arg_type_qual [[META26]] { -// GFX900-NEXT: entry: -// GFX900-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// GFX900-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// GFX900-NEXT: [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5) -// GFX900-NEXT: [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5) -// GFX900-NEXT: [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5) -// GFX900-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr -// GFX900-NEXT: [[DEFAULT_QUEUE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DEFAULT_QUEUE]] to ptr -// GFX900-NEXT: [[FLAGS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FLAGS]] to ptr -// GFX900-NEXT: [[NDRANGE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[NDRANGE]] to ptr -// GFX900-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr -// GFX900-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[TBAA37:![0-9]+]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR7]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR7]] -// GFX900-NEXT: store i32 0, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR7]] -// GFX900-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime() -// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA19]] -// GFX900-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]] -// GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] -// GFX900-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr), ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr)) -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR7]] -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR7]] -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR7]] -// GFX900-NEXT: ret void -// -// // GFX900: Function Attrs: convergent nounwind // GFX900-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke -// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR5]] { +// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR6]] { // GFX900-NEXT: entry: // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr @@ -734,7 +804,7 @@ kernel void test_target_features_kernel(global int *i) { // // GFX900: Function Attrs: convergent nounwind // GFX900-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke_kernel -// GFX900-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR5]] !associated [[META39:![0-9]+]] !kernel_arg_addr_space [[META23]] !kernel_arg_access_qual [[META24]] !kernel_arg_type [[META25]] !kernel_arg_base_type [[META25]] !kernel_arg_type_qual [[META26]] { +// GFX900-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR7]] !kernel_arg_addr_space [[META28]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META29]] !kernel_arg_base_type [[META29]] !kernel_arg_type_qual [[META25]] { // GFX900-NEXT: entry: // GFX900-NEXT: [[TMP1:%.*]] = alloca { i32, i32, ptr }, align 8, addrspace(5) // GFX900-NEXT: store { i32, i32, ptr } [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -746,22 +816,28 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU: attributes #[[ATTR0:[0-9]+]] = { "objc_arc_inert" } // NOCPU: attributes #[[ATTR1]] = { convergent noinline norecurse nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // NOCPU: attributes #[[ATTR2]] = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" } -// NOCPU: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } -// NOCPU: attributes #[[ATTR4]] = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// NOCPU: attributes #[[ATTR5]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// NOCPU: attributes #[[ATTR6]] = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+s-memtime-inst" "uniform-work-group-size"="false" } -// NOCPU: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind willreturn } -// NOCPU: attributes #[[ATTR8]] = { convergent nounwind } +// NOCPU: attributes #[[ATTR3]] = { alwaysinline convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" } +// NOCPU: attributes #[[ATTR4]] = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+s-memtime-inst" "uniform-work-group-size"="false" } +// NOCPU: attributes #[[ATTR5]] = { alwaysinline convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+s-memtime-inst" "uniform-work-group-size"="false" } +// NOCPU: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +// NOCPU: attributes #[[ATTR7]] = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// NOCPU: attributes #[[ATTR8]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "enqueued-block" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// NOCPU: attributes #[[ATTR9:[0-9]+]] = { nocallback nofree nosync nounwind willreturn } +// NOCPU: attributes #[[ATTR10]] = { convergent nounwind "uniform-work-group-size"="false" } +// NOCPU: attributes #[[ATTR11]] = { convergent nounwind } //. // GFX900: attributes #[[ATTR0:[0-9]+]] = { "objc_arc_inert" } // GFX900: attributes #[[ATTR1]] = { convergent norecurse nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" } // GFX900: attributes #[[ATTR2]] = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" "uniform-work-group-size"="false" } -// GFX900: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } -// GFX900: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } -// GFX900: attributes #[[ATTR5]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" } -// GFX900: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind willreturn } -// GFX900: attributes #[[ATTR7]] = { nounwind } -// GFX900: attributes #[[ATTR8]] = { convergent nounwind } +// GFX900: attributes #[[ATTR3]] = { alwaysinline convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" "uniform-work-group-size"="false" } +// GFX900: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +// GFX900: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +// GFX900: attributes #[[ATTR6]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" } +// GFX900: attributes #[[ATTR7]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "enqueued-block" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" } +// GFX900: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind willreturn } +// GFX900: attributes #[[ATTR9]] = { convergent nounwind "uniform-work-group-size"="false" } +// GFX900: attributes #[[ATTR10]] = { nounwind } +// GFX900: attributes #[[ATTR11]] = { convergent nounwind } //. // NOCPU: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} // NOCPU: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} @@ -770,21 +846,16 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU: [[META4]] = !{!"none", !"none", !"none", !"none"} // NOCPU: [[META5]] = !{!"char*", !"char", !"long*", !"long"} // NOCPU: [[META6]] = !{!"", !"", !"", !""} -// NOCPU: [[META7]] = !{ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle} -// NOCPU: [[META8]] = !{i32 0} -// NOCPU: [[META9]] = !{!"none"} -// NOCPU: [[META10]] = !{!"__block_literal"} -// NOCPU: [[META11]] = !{!""} -// NOCPU: [[META12]] = !{ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle} -// NOCPU: [[META13]] = !{ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle} -// NOCPU: [[META14]] = !{i32 0, i32 3} -// NOCPU: [[META15]] = !{!"none", !"none"} -// NOCPU: [[META16]] = !{!"__block_literal", !"void*"} -// NOCPU: [[META17]] = !{!"", !""} -// NOCPU: [[META18]] = !{ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle} -// NOCPU: [[META19]] = !{i32 1} -// NOCPU: [[META20]] = !{!"int*"} -// NOCPU: [[META21]] = !{ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle} +// NOCPU: [[META7]] = !{i32 1} +// NOCPU: [[META8]] = !{!"none"} +// NOCPU: [[META9]] = !{!"int*"} +// NOCPU: [[META10]] = !{!""} +// NOCPU: [[META11]] = !{i32 0} +// NOCPU: [[META12]] = !{!"__block_literal"} +// NOCPU: [[META13]] = !{i32 0, i32 3} +// NOCPU: [[META14]] = !{!"none", !"none"} +// NOCPU: [[META15]] = !{!"__block_literal", !"void*"} +// NOCPU: [[META16]] = !{!"", !""} //. // GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} // GFX900: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} @@ -808,24 +879,20 @@ kernel void test_target_features_kernel(global int *i) { // GFX900: [[TBAA19]] = !{[[META20:![0-9]+]], [[META20]], i64 0} // GFX900: [[META20]] = !{!"queue_t", [[META5]], i64 0} // GFX900: [[TBAA_STRUCT21]] = !{i64 0, i64 4, [[TBAA17]]} -// GFX900: [[META22]] = !{ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle} -// GFX900: [[META23]] = !{i32 0} -// GFX900: [[META24]] = !{!"none"} -// GFX900: [[META25]] = !{!"__block_literal"} -// GFX900: [[META26]] = !{!""} -// GFX900: [[META27]] = !{ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle} -// GFX900: [[TBAA28]] = !{[[META9]], [[META9]], i64 0} -// GFX900: [[META29]] = !{ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle} -// GFX900: [[META30]] = !{i32 0, i32 3} -// GFX900: [[META31]] = !{!"none", !"none"} -// GFX900: [[META32]] = !{!"__block_literal", !"void*"} -// GFX900: [[META33]] = !{!"", !""} -// GFX900: [[META34]] = !{ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle} -// GFX900: [[META35]] = !{i32 1} -// GFX900: [[META36]] = !{!"int*"} -// GFX900: [[TBAA37]] = !{[[META38:![0-9]+]], [[META38]], i64 0} -// GFX900: [[META38]] = !{!"p1 int", [[META9]], i64 0} -// GFX900: [[META39]] = !{ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle} +// GFX900: [[META22]] = !{i32 1} +// GFX900: [[META23]] = !{!"none"} +// GFX900: [[META24]] = !{!"int*"} +// GFX900: [[META25]] = !{!""} +// GFX900: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0} +// GFX900: [[META27]] = !{!"p1 int", [[META9]], i64 0} +// GFX900: [[META28]] = !{i32 0} +// GFX900: [[META29]] = !{!"__block_literal"} +// GFX900: [[TBAA30]] = !{[[META31:![0-9]+]], [[META31]], i64 0} +// GFX900: [[META31]] = !{!"p1 void", [[META9]], i64 0} +// GFX900: [[META32]] = !{i32 0, i32 3} +// GFX900: [[META33]] = !{!"none", !"none"} +// GFX900: [[META34]] = !{!"__block_literal", !"void*"} +// GFX900: [[META35]] = !{!"", !""} //. //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: // CHECK: {{.*}} diff --git a/clang/test/CodeGenOpenCL/amdgpu-printf.cl b/clang/test/CodeGenOpenCL/amdgpu-printf.cl index edf6dbf8657cb..33fee66d6e9fc 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-printf.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-printf.cl @@ -1,43 +1,86 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 4 // RUN: %clang_cc1 -cl-std=CL1.2 -triple amdgcn-amd-amdhsa -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2))); -// CHECK-LABEL: @test_printf_noargs( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str) #[[ATTR4:[0-9]+]] -// CHECK-NEXT: ret void -// __kernel void test_printf_noargs() { printf(""); } -// CHECK-LABEL: @test_printf_int( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: store i32 [[I:%.*]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA8:![0-9]+]] -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str.1, i32 noundef [[TMP0]]) #[[ATTR4]] -// CHECK-NEXT: ret void -// __kernel void test_printf_int(int i) { printf("%d", i); } -// CHECK-LABEL: @test_printf_str_int( +__kernel void test_printf_str_int(int i) { + char s[] = "foo"; + printf("%s:%d", s, i); +} +// CHECK-LABEL: define dso_local amdgpu_kernel void @test_printf_noargs( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META4]] !kernel_arg_base_type [[META4]] !kernel_arg_type_qual [[META4]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @__clang_ocl_kern_imp_test_printf_noargs() #[[ATTR5:[0-9]+]] +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define dso_local void @__clang_ocl_kern_imp_test_printf_noargs( +// CHECK-SAME: ) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META4]] !kernel_arg_base_type [[META4]] !kernel_arg_type_qual [[META4]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str) #[[ATTR6:[0-9]+]] +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define dso_local amdgpu_kernel void @test_printf_int( +// CHECK-SAME: i32 noundef [[I:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META5:![0-9]+]] !kernel_arg_access_qual [[META6:![0-9]+]] !kernel_arg_type [[META7:![0-9]+]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META8:![0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store i32 [[I]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]] +// CHECK-NEXT: call void @__clang_ocl_kern_imp_test_printf_int(i32 noundef [[TMP0]]) #[[ATTR5]] +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define dso_local void @__clang_ocl_kern_imp_test_printf_int( +// CHECK-SAME: i32 noundef [[I:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META5]] !kernel_arg_access_qual [[META6]] !kernel_arg_type [[META7]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META8]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store i32 [[I]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]] +// CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str.1, i32 noundef [[TMP0]]) #[[ATTR6]] +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define dso_local amdgpu_kernel void @test_printf_str_int( +// CHECK-SAME: i32 noundef [[I:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META5]] !kernel_arg_access_qual [[META6]] !kernel_arg_type [[META7]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META8]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store i32 [[I]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]] +// CHECK-NEXT: call void @__clang_ocl_kern_imp_test_printf_str_int(i32 noundef [[TMP0]]) #[[ATTR5]] +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define dso_local void @__clang_ocl_kern_imp_test_printf_str_int( +// CHECK-SAME: i32 noundef [[I:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META5]] !kernel_arg_access_qual [[META6]] !kernel_arg_type [[META7]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META8]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[S:%.*]] = alloca [4 x i8], align 1, addrspace(5) -// CHECK-NEXT: store i32 [[I:%.*]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[S]]) #[[ATTR5:[0-9]+]] +// CHECK-NEXT: store i32 [[I]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]] +// CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[S]]) #[[ATTR7:[0-9]+]] // CHECK-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 1 [[S]], ptr addrspace(4) align 1 @__const.test_printf_str_int.s, i64 4, i1 false) // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x i8], ptr addrspace(5) [[S]], i64 0, i64 0 -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str.2, ptr addrspace(5) noundef [[ARRAYDECAY]], i32 noundef [[TMP2]]) #[[ATTR4]] -// CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[S]]) #[[ATTR5]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]] +// CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str.2, ptr addrspace(5) noundef [[ARRAYDECAY]], i32 noundef [[TMP0]]) #[[ATTR6]] +// CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[S]]) #[[ATTR7]] // CHECK-NEXT: ret void // -__kernel void test_printf_str_int(int i) { - char s[] = "foo"; - printf("%s:%d", s, i); -} +//. +// CHECK: [[META4]] = !{} +// CHECK: [[META5]] = !{i32 0} +// CHECK: [[META6]] = !{!"none"} +// CHECK: [[META7]] = !{!"int"} +// CHECK: [[META8]] = !{!""} +// CHECK: [[TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} +// CHECK: [[META10]] = !{!"int", [[META11:![0-9]+]], i64 0} +// CHECK: [[META11]] = !{!"omnipotent char", [[META12:![0-9]+]], i64 0} +// CHECK: [[META12]] = !{!"Simple C/C++ TBAA"} +//. diff --git a/clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl b/clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl index d139621ede4e7..365135a251a24 100644 --- a/clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl +++ b/clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl @@ -4,14 +4,14 @@ // RUN: %clang_cc1 -emit-llvm -O0 -cl-std=CL2.0 -foffload-uniform-block -o - %s 2>&1 | FileCheck %s -check-prefixes CHECK,CHECK-UNIFORM kernel void ker() {}; -// CHECK: define{{.*}}@ker() #0 +// CHECK: define{{.*}}@ker() #[[ATTR0:[0-9]+]] void foo() {}; -// CHECK: define{{.*}}@foo() #1 +// CHECK: define{{.*}}@foo() #[[ATTR1:[0-9]+]] -// CHECK-LABEL: attributes #0 +// CHECK: attributes #[[ATTR0]] // CHECK-UNIFORM: "uniform-work-group-size"="true" // CHECK-NONUNIFORM: "uniform-work-group-size"="false" -// CHECK-LABEL: attributes #1 +// CHECK: attributes #[[ATTR1]] // CHECK-NOT: uniform-work-group-size diff --git a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl index 451d30b4d86f0..f5c342d1243ae 100644 --- a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl +++ b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl @@ -24,6 +24,53 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { // SPIR32-LABEL: define {{[^@]+}}@device_side_enqueue // SPIR32-SAME: (ptr addrspace(1) align 4 [[A:%.*]], ptr addrspace(1) align 4 [[B:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META2:![0-9]+]] !kernel_arg_access_qual [[META3:![0-9]+]] !kernel_arg_type [[META4:![0-9]+]] !kernel_arg_base_type [[META4]] !kernel_arg_type_qual [[META5:![0-9]+]] { // SPIR32-NEXT: entry: +// SPIR32-NEXT: [[A_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR32-NEXT: [[B_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR32-NEXT: [[I_ADDR_I:%.*]] = alloca i32, align 4 +// SPIR32-NEXT: [[DEFAULT_QUEUE_I:%.*]] = alloca target("spirv.Queue"), align 4 +// SPIR32-NEXT: [[FLAGS_I:%.*]] = alloca i32, align 4 +// SPIR32-NEXT: [[NDRANGE_I:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4 +// SPIR32-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4 +// SPIR32-NEXT: [[BLOCK_I:%.*]] = alloca <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, align 4 +// SPIR32-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR32-NEXT: [[B_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR32-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4 +// SPIR32-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR]], align 4 +// SPIR32-NEXT: store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 4 +// SPIR32-NEXT: store i32 [[I]], ptr [[I_ADDR]], align 4 +// SPIR32-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 4 +// SPIR32-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 4 +// SPIR32-NEXT: [[TMP2:%.*]] = load i32, ptr [[I_ADDR]], align 4 +// SPIR32-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[A_ADDR_I]], align 4 +// SPIR32-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[B_ADDR_I]], align 4 +// SPIR32-NEXT: store i32 [[TMP2]], ptr [[I_ADDR_I]], align 4 +// SPIR32-NEXT: store i32 0, ptr [[FLAGS_I]], align 4 +// SPIR32-NEXT: [[TMP3:%.*]] = load target("spirv.Queue"), ptr [[DEFAULT_QUEUE_I]], align 4 +// SPIR32-NEXT: [[TMP4:%.*]] = load i32, ptr [[FLAGS_I]], align 4 +// SPIR32-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP_I]], ptr align 4 [[NDRANGE_I]], i32 4, i1 false) +// SPIR32-NEXT: store i32 24, ptr [[BLOCK_I]], align 4 +// SPIR32-NEXT: [[BLOCK_ALIGN_I:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK_I]], i32 0, i32 1 +// SPIR32-NEXT: store i32 4, ptr [[BLOCK_ALIGN_I]], align 4 +// SPIR32-NEXT: [[BLOCK_INVOKE_I:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK_I]], i32 0, i32 2 +// SPIR32-NEXT: store ptr addrspace(4) addrspacecast (ptr @__device_side_enqueue_block_invoke to ptr addrspace(4)), ptr [[BLOCK_INVOKE_I]], align 4 +// SPIR32-NEXT: [[BLOCK_CAPTURED_I:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK_I]], i32 0, i32 3 +// SPIR32-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_I]], align 4 +// SPIR32-NEXT: store ptr addrspace(1) [[TMP5]], ptr [[BLOCK_CAPTURED_I]], align 4 +// SPIR32-NEXT: [[BLOCK_CAPTURED1_I:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK_I]], i32 0, i32 4 +// SPIR32-NEXT: [[TMP6:%.*]] = load i32, ptr [[I_ADDR_I]], align 4 +// SPIR32-NEXT: store i32 [[TMP6]], ptr [[BLOCK_CAPTURED1_I]], align 4 +// SPIR32-NEXT: [[BLOCK_CAPTURED2_I:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK_I]], i32 0, i32 5 +// SPIR32-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[B_ADDR_I]], align 4 +// SPIR32-NEXT: store ptr addrspace(1) [[TMP7]], ptr [[BLOCK_CAPTURED2_I]], align 4 +// SPIR32-NEXT: [[TMP8:%.*]] = addrspacecast ptr [[BLOCK_I]] to ptr addrspace(4) +// SPIR32-NEXT: [[TMP9:%.*]] = call spir_func i32 @__enqueue_kernel_basic(target("spirv.Queue") [[TMP3]], i32 [[TMP4]], ptr [[TMP_I]], ptr addrspace(4) addrspacecast (ptr @__device_side_enqueue_block_invoke_kernel to ptr addrspace(4)), ptr addrspace(4) [[TMP8]]) #[[ATTR6:[0-9]+]] +// SPIR32-NEXT: ret void +// +// +// SPIR32: Function Attrs: alwaysinline convergent norecurse nounwind +// SPIR32-LABEL: define {{[^@]+}}@__clang_ocl_kern_imp_device_side_enqueue +// SPIR32-SAME: (ptr addrspace(1) align 4 [[A:%.*]], ptr addrspace(1) align 4 [[B:%.*]], i32 [[I:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META2]] !kernel_arg_access_qual [[META3]] !kernel_arg_type [[META4]] !kernel_arg_base_type [[META4]] !kernel_arg_type_qual [[META5]] { +// SPIR32-NEXT: entry: // SPIR32-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // SPIR32-NEXT: [[B_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // SPIR32-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4 @@ -61,7 +108,7 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { // // SPIR32: Function Attrs: convergent noinline nounwind optnone // SPIR32-LABEL: define {{[^@]+}}@__device_side_enqueue_block_invoke -// SPIR32-SAME: (ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR2:[0-9]+]] { +// SPIR32-SAME: (ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR3:[0-9]+]] { // SPIR32-NEXT: entry: // SPIR32-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr addrspace(4), align 4 // SPIR32-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr addrspace(4), align 4 @@ -85,7 +132,7 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { // // SPIR32: Function Attrs: convergent nounwind // SPIR32-LABEL: define {{[^@]+}}@__device_side_enqueue_block_invoke_kernel -// SPIR32-SAME: (ptr addrspace(4) [[TMP0:%.*]]) #[[ATTR4:[0-9]+]] { +// SPIR32-SAME: (ptr addrspace(4) [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] { // SPIR32-NEXT: entry: // SPIR32-NEXT: call spir_func void @__device_side_enqueue_block_invoke(ptr addrspace(4) [[TMP0]]) // SPIR32-NEXT: ret void @@ -95,6 +142,53 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { // STRICTFP-LABEL: define {{[^@]+}}@device_side_enqueue // STRICTFP-SAME: (ptr addrspace(1) align 4 [[A:%.*]], ptr addrspace(1) align 4 [[B:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META2:![0-9]+]] !kernel_arg_access_qual [[META3:![0-9]+]] !kernel_arg_type [[META4:![0-9]+]] !kernel_arg_base_type [[META4]] !kernel_arg_type_qual [[META5:![0-9]+]] { // STRICTFP-NEXT: entry: +// STRICTFP-NEXT: [[A_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 +// STRICTFP-NEXT: [[B_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 +// STRICTFP-NEXT: [[I_ADDR_I:%.*]] = alloca i32, align 4 +// STRICTFP-NEXT: [[DEFAULT_QUEUE_I:%.*]] = alloca target("spirv.Queue"), align 4 +// STRICTFP-NEXT: [[FLAGS_I:%.*]] = alloca i32, align 4 +// STRICTFP-NEXT: [[NDRANGE_I:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4 +// STRICTFP-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4 +// STRICTFP-NEXT: [[BLOCK_I:%.*]] = alloca <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, align 4 +// STRICTFP-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// STRICTFP-NEXT: [[B_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// STRICTFP-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4 +// STRICTFP-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR]], align 4 +// STRICTFP-NEXT: store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 4 +// STRICTFP-NEXT: store i32 [[I]], ptr [[I_ADDR]], align 4 +// STRICTFP-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 4 +// STRICTFP-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 4 +// STRICTFP-NEXT: [[TMP2:%.*]] = load i32, ptr [[I_ADDR]], align 4 +// STRICTFP-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[A_ADDR_I]], align 4 +// STRICTFP-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[B_ADDR_I]], align 4 +// STRICTFP-NEXT: store i32 [[TMP2]], ptr [[I_ADDR_I]], align 4 +// STRICTFP-NEXT: store i32 0, ptr [[FLAGS_I]], align 4 +// STRICTFP-NEXT: [[TMP3:%.*]] = load target("spirv.Queue"), ptr [[DEFAULT_QUEUE_I]], align 4 +// STRICTFP-NEXT: [[TMP4:%.*]] = load i32, ptr [[FLAGS_I]], align 4 +// STRICTFP-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP_I]], ptr align 4 [[NDRANGE_I]], i32 4, i1 false) #[[ATTR6:[0-9]+]] +// STRICTFP-NEXT: store i32 24, ptr [[BLOCK_I]], align 4 +// STRICTFP-NEXT: [[BLOCK_ALIGN_I:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK_I]], i32 0, i32 1 +// STRICTFP-NEXT: store i32 4, ptr [[BLOCK_ALIGN_I]], align 4 +// STRICTFP-NEXT: [[BLOCK_INVOKE_I:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK_I]], i32 0, i32 2 +// STRICTFP-NEXT: store ptr addrspace(4) addrspacecast (ptr @__device_side_enqueue_block_invoke to ptr addrspace(4)), ptr [[BLOCK_INVOKE_I]], align 4 +// STRICTFP-NEXT: [[BLOCK_CAPTURED_I:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK_I]], i32 0, i32 3 +// STRICTFP-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_I]], align 4 +// STRICTFP-NEXT: store ptr addrspace(1) [[TMP5]], ptr [[BLOCK_CAPTURED_I]], align 4 +// STRICTFP-NEXT: [[BLOCK_CAPTURED1_I:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK_I]], i32 0, i32 4 +// STRICTFP-NEXT: [[TMP6:%.*]] = load i32, ptr [[I_ADDR_I]], align 4 +// STRICTFP-NEXT: store i32 [[TMP6]], ptr [[BLOCK_CAPTURED1_I]], align 4 +// STRICTFP-NEXT: [[BLOCK_CAPTURED2_I:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK_I]], i32 0, i32 5 +// STRICTFP-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[B_ADDR_I]], align 4 +// STRICTFP-NEXT: store ptr addrspace(1) [[TMP7]], ptr [[BLOCK_CAPTURED2_I]], align 4 +// STRICTFP-NEXT: [[TMP8:%.*]] = addrspacecast ptr [[BLOCK_I]] to ptr addrspace(4) +// STRICTFP-NEXT: [[TMP9:%.*]] = call spir_func i32 @__enqueue_kernel_basic(target("spirv.Queue") [[TMP3]], i32 [[TMP4]], ptr [[TMP_I]], ptr addrspace(4) addrspacecast (ptr @__device_side_enqueue_block_invoke_kernel to ptr addrspace(4)), ptr addrspace(4) [[TMP8]]) #[[ATTR7:[0-9]+]] +// STRICTFP-NEXT: ret void +// +// +// STRICTFP: Function Attrs: alwaysinline convergent norecurse nounwind strictfp +// STRICTFP-LABEL: define {{[^@]+}}@__clang_ocl_kern_imp_device_side_enqueue +// STRICTFP-SAME: (ptr addrspace(1) align 4 [[A:%.*]], ptr addrspace(1) align 4 [[B:%.*]], i32 [[I:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META2]] !kernel_arg_access_qual [[META3]] !kernel_arg_type [[META4]] !kernel_arg_base_type [[META4]] !kernel_arg_type_qual [[META5]] { +// STRICTFP-NEXT: entry: // STRICTFP-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // STRICTFP-NEXT: [[B_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // STRICTFP-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4 @@ -109,7 +203,7 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { // STRICTFP-NEXT: store i32 0, ptr [[FLAGS]], align 4 // STRICTFP-NEXT: [[TMP0:%.*]] = load target("spirv.Queue"), ptr [[DEFAULT_QUEUE]], align 4 // STRICTFP-NEXT: [[TMP1:%.*]] = load i32, ptr [[FLAGS]], align 4 -// STRICTFP-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP]], ptr align 4 [[NDRANGE]], i32 4, i1 false) #[[ATTR5:[0-9]+]] +// STRICTFP-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP]], ptr align 4 [[NDRANGE]], i32 4, i1 false) #[[ATTR6]] // STRICTFP-NEXT: [[BLOCK_SIZE:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK]], i32 0, i32 0 // STRICTFP-NEXT: store i32 24, ptr [[BLOCK_SIZE]], align 4 // STRICTFP-NEXT: [[BLOCK_ALIGN:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK]], i32 0, i32 1 @@ -126,13 +220,13 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { // STRICTFP-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 4 // STRICTFP-NEXT: store ptr addrspace(1) [[TMP4]], ptr [[BLOCK_CAPTURED2]], align 4 // STRICTFP-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[BLOCK]] to ptr addrspace(4) -// STRICTFP-NEXT: [[TMP6:%.*]] = call spir_func i32 @__enqueue_kernel_basic(target("spirv.Queue") [[TMP0]], i32 [[TMP1]], ptr [[TMP]], ptr addrspace(4) addrspacecast (ptr @__device_side_enqueue_block_invoke_kernel to ptr addrspace(4)), ptr addrspace(4) [[TMP5]]) #[[ATTR5]] +// STRICTFP-NEXT: [[TMP6:%.*]] = call spir_func i32 @__enqueue_kernel_basic(target("spirv.Queue") [[TMP0]], i32 [[TMP1]], ptr [[TMP]], ptr addrspace(4) addrspacecast (ptr @__device_side_enqueue_block_invoke_kernel to ptr addrspace(4)), ptr addrspace(4) [[TMP5]]) #[[ATTR6]] // STRICTFP-NEXT: ret void // // // STRICTFP: Function Attrs: convergent noinline nounwind optnone strictfp // STRICTFP-LABEL: define {{[^@]+}}@__device_side_enqueue_block_invoke -// STRICTFP-SAME: (ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR2:[0-9]+]] { +// STRICTFP-SAME: (ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR3:[0-9]+]] { // STRICTFP-NEXT: entry: // STRICTFP-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr addrspace(4), align 4 // STRICTFP-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr addrspace(4), align 4 @@ -144,7 +238,7 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { // STRICTFP-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[BLOCK_CAPTURE_ADDR1]], align 4 // STRICTFP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[TMP0]], i32 [[TMP1]] // STRICTFP-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 -// STRICTFP-NEXT: [[TMP3:%.*]] = call float @llvm.experimental.constrained.fmuladd.f32(float 4.000000e+00, float [[TMP2]], float 1.000000e+00, metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR5]] +// STRICTFP-NEXT: [[TMP3:%.*]] = call float @llvm.experimental.constrained.fmuladd.f32(float 4.000000e+00, float [[TMP2]], float 1.000000e+00, metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR6]] // STRICTFP-NEXT: [[BLOCK_CAPTURE_ADDR2:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3 // STRICTFP-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[BLOCK_CAPTURE_ADDR2]], align 4 // STRICTFP-NEXT: [[BLOCK_CAPTURE_ADDR3:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4 @@ -156,24 +250,28 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { // // STRICTFP: Function Attrs: convergent nounwind // STRICTFP-LABEL: define {{[^@]+}}@__device_side_enqueue_block_invoke_kernel -// STRICTFP-SAME: (ptr addrspace(4) [[TMP0:%.*]]) #[[ATTR4:[0-9]+]] { +// STRICTFP-SAME: (ptr addrspace(4) [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] { // STRICTFP-NEXT: entry: -// STRICTFP-NEXT: call spir_func void @__device_side_enqueue_block_invoke(ptr addrspace(4) [[TMP0]]) #[[ATTR5]] +// STRICTFP-NEXT: call spir_func void @__device_side_enqueue_block_invoke(ptr addrspace(4) [[TMP0]]) #[[ATTR6]] // STRICTFP-NEXT: ret void // //. // SPIR32: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } -// SPIR32: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } -// SPIR32: attributes #[[ATTR2]] = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// SPIR32: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -// SPIR32: attributes #[[ATTR4]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// SPIR32: attributes #[[ATTR1]] = { alwaysinline convergent norecurse nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } +// SPIR32: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +// SPIR32: attributes #[[ATTR3]] = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// SPIR32: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +// SPIR32: attributes #[[ATTR5]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// SPIR32: attributes #[[ATTR6]] = { nounwind } //. // STRICTFP: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind optnone strictfp "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" } -// STRICTFP: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } -// STRICTFP: attributes #[[ATTR2]] = { convergent noinline nounwind optnone strictfp "stack-protector-buffer-size"="8" } -// STRICTFP: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind strictfp willreturn memory(inaccessiblemem: readwrite) } -// STRICTFP: attributes #[[ATTR4]] = { convergent nounwind "stack-protector-buffer-size"="8" } -// STRICTFP: attributes #[[ATTR5]] = { strictfp } +// STRICTFP: attributes #[[ATTR1]] = { alwaysinline convergent norecurse nounwind strictfp "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" } +// STRICTFP: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +// STRICTFP: attributes #[[ATTR3]] = { convergent noinline nounwind optnone strictfp "stack-protector-buffer-size"="8" } +// STRICTFP: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind strictfp willreturn memory(inaccessiblemem: readwrite) } +// STRICTFP: attributes #[[ATTR5]] = { convergent nounwind "stack-protector-buffer-size"="8" } +// STRICTFP: attributes #[[ATTR6]] = { strictfp } +// STRICTFP: attributes #[[ATTR7]] = { nounwind strictfp } //. // SPIR32: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} // SPIR32: [[META1:![0-9]+]] = !{i32 2, i32 0} diff --git a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl index a1408d38a44c9..e7f2469f968f9 100644 --- a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl +++ b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl @@ -39,7 +39,7 @@ void callee(int id, __global int *out) { out[id] = id; } -// COMMON-LABEL: define{{.*}} spir_kernel void @device_side_enqueue(ptr addrspace(1) align 4 %{{.*}}, ptr addrspace(1) align 4 %b, i32 %i) +// COMMON-LABEL: define{{.*}} spir_func void @__clang_ocl_kern_imp_device_side_enqueue(ptr addrspace(1) align 4 %{{.*}}, ptr addrspace(1) align 4 %b, i32 %i) kernel void device_side_enqueue(global int *a, global int *b, int i) { // SPIR: %default_queue = alloca target("spirv.Queue") // X86: %default_queue = alloca ptr diff --git a/clang/test/CodeGenOpenCL/convergent.cl b/clang/test/CodeGenOpenCL/convergent.cl index 123adba7b40d2..53a35a4f73119 100644 --- a/clang/test/CodeGenOpenCL/convergent.cl +++ b/clang/test/CodeGenOpenCL/convergent.cl @@ -127,7 +127,7 @@ void test_not_unroll() { // CHECK: declare spir_func void @nodupfun(){{[^#]*}} #[[attr3:[0-9]+]] // CHECK-LABEL: @assume_convergent_asm -// CHECK: tail call void asm sideeffect "s_barrier", ""() #5 +// CHECK: tail call void asm sideeffect "s_barrier", ""() #6 kernel void assume_convergent_asm() { __asm__ volatile("s_barrier"); @@ -138,6 +138,7 @@ kernel void assume_convergent_asm() // CHECK: attributes #2 = { {{[^}]*}}convergent{{[^}]*}} } // CHECK: attributes #3 = { {{[^}]*}}convergent noduplicate{{[^}]*}} } // CHECK: attributes #4 = { {{[^}]*}}convergent{{[^}]*}} } -// CHECK: attributes #5 = { {{[^}]*}}convergent{{[^}]*}} } -// CHECK: attributes #6 = { {{[^}]*}}nounwind{{[^}]*}} } -// CHECK: attributes #7 = { {{[^}]*}}convergent noduplicate nounwind{{[^}]*}} } +// CHECK: attributes #5 = { {{[^}]*}}alwaysinline convergent{{[^}]*}} } +// CHECK: attributes #6 = { {{[^}]*}}convergent{{[^}]*}} } +// CHECK: attributes #7 = { {{[^}]*}}nounwind{{[^}]*}} } +// CHECK: attributes #8 = { {{[^}]*}}convergent noduplicate nounwind{{[^}]*}} } diff --git a/clang/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl b/clang/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl index b1e45e6d6e6dc..8e970f121bca8 100644 --- a/clang/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl +++ b/clang/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl @@ -36,6 +36,6 @@ kernel void test(int i) { // Check that the temporary is scoped to the `if` // CHECK-DEBUG: ![[TESTFILE:[0-9]+]] = !DIFile(filename: "" -// CHECK-DEBUG: ![[TESTSCOPE:[0-9]+]] = distinct !DISubprogram(name: "test", {{.*}} file: ![[TESTFILE]] +// CHECK-DEBUG: ![[TESTSCOPE:[0-9]+]] = distinct !DISubprogram(name: "test", linkageName: "__clang_ocl_kern_imp_test", {{.*}} file: ![[TESTFILE]] // CHECK-DEBUG: ![[IFSCOPE:[0-9]+]] = distinct !DILexicalBlock(scope: ![[TESTSCOPE]], file: ![[TESTFILE]], line: 26) // CHECK-DEBUG: ![[TEMPLOCATION]] = !DILocation(line: 27, scope: ![[IFSCOPE]]) diff --git a/clang/test/CodeGenOpenCL/kernel-arg-info.cl b/clang/test/CodeGenOpenCL/kernel-arg-info.cl index dbb59af9470ce..3a2284223a105 100644 --- a/clang/test/CodeGenOpenCL/kernel-arg-info.cl +++ b/clang/test/CodeGenOpenCL/kernel-arg-info.cl @@ -21,6 +21,7 @@ kernel void foo(global int * globalintp, global int * restrict globalintrestrict *globalintrestrictp = constint + volatileint; } // CHECK: define{{.*}} spir_kernel void @foo{{[^!]+}} +// ARGINFO: define{{.*}} spir_kernel void @foo{{[^!]+}} // CHECK: !kernel_arg_addr_space ![[MD11:[0-9]+]] // CHECK: !kernel_arg_access_qual ![[MD12:[0-9]+]] // CHECK: !kernel_arg_type ![[MD13:[0-9]+]] @@ -32,6 +33,7 @@ kernel void foo(global int * globalintp, global int * restrict globalintrestrict kernel void foo2(read_only image1d_t img1, image2d_t img2, write_only image2d_array_t img3, read_write image1d_t img4) { } // CHECK: define{{.*}} spir_kernel void @foo2{{[^!]+}} +// ARGINFO: define{{.*}} spir_kernel void @foo2{{[^!]+}} // CHECK: !kernel_arg_addr_space ![[MD21:[0-9]+]] // CHECK: !kernel_arg_access_qual ![[MD22:[0-9]+]] // CHECK: !kernel_arg_type ![[MD23:[0-9]+]] @@ -43,6 +45,7 @@ kernel void foo2(read_only image1d_t img1, image2d_t img2, write_only image2d_ar kernel void foo3(__global half * X) { } // CHECK: define{{.*}} spir_kernel void @foo3{{[^!]+}} +// ARGINFO: define{{.*}} spir_kernel void @foo3{{[^!]+}} // CHECK: !kernel_arg_addr_space ![[MD31:[0-9]+]] // CHECK: !kernel_arg_access_qual ![[MD32:[0-9]+]] // CHECK: !kernel_arg_type ![[MD33:[0-9]+]] @@ -55,6 +58,7 @@ typedef unsigned int myunsignedint; kernel void foo4(__global unsigned int * X, __global myunsignedint * Y) { } // CHECK: define{{.*}} spir_kernel void @foo4{{[^!]+}} +// ARGINFO: define{{.*}} spir_kernel void @foo4{{[^!]+}} // CHECK: !kernel_arg_addr_space ![[MD41:[0-9]+]] // CHECK: !kernel_arg_access_qual ![[MD42:[0-9]+]] // CHECK: !kernel_arg_type ![[MD43:[0-9]+]] @@ -67,6 +71,7 @@ typedef image1d_t myImage; kernel void foo5(myImage img1, write_only image1d_t img2) { } // CHECK: define{{.*}} spir_kernel void @foo5{{[^!]+}} +// ARGINFO: define{{.*}} spir_kernel void @foo5{{[^!]+}} // CHECK: !kernel_arg_addr_space ![[MD41:[0-9]+]] // CHECK: !kernel_arg_access_qual ![[MD51:[0-9]+]] // CHECK: !kernel_arg_type ![[MD52:[0-9]+]] @@ -77,6 +82,8 @@ kernel void foo5(myImage img1, write_only image1d_t img2) { typedef char char16 __attribute__((ext_vector_type(16))); __kernel void foo6(__global char16 arg[]) {} +// CHECK: define{{.*}} spir_kernel void @foo6{{[^!]+}} +// ARGINFO: define{{.*}} spir_kernel void @foo6{{[^!]+}} // CHECK: !kernel_arg_type ![[MD61:[0-9]+]] // ARGINFO: !kernel_arg_name ![[MD62:[0-9]+]] @@ -87,6 +94,7 @@ kernel void foo7(ROImage ro, WOImage wo, RWImage rw) { } // CHECK: define{{.*}} spir_kernel void @foo7{{[^!]+}} +// ARGINFO: define{{.*}} spir_kernel void @foo7{{[^!]+}} // CHECK: !kernel_arg_addr_space ![[MD71:[0-9]+]] // CHECK: !kernel_arg_access_qual ![[MD72:[0-9]+]] // CHECK: !kernel_arg_type ![[MD73:[0-9]+]] @@ -99,6 +107,7 @@ typedef unsigned char uchar; typedef uchar uchar2 __attribute__((ext_vector_type(2))); kernel void foo8(pipe int p1, pipe uchar p2, pipe uchar2 p3, const pipe uchar p4, write_only pipe uchar p5) {} // CHECK: define{{.*}} spir_kernel void @foo8{{[^!]+}} +// ARGINFO: define{{.*}} spir_kernel void @foo8{{[^!]+}} // CHECK: !kernel_arg_addr_space ![[PIPE_AS_QUAL:[0-9]+]] // CHECK: !kernel_arg_access_qual ![[PIPE_ACCESS_QUAL:[0-9]+]] // CHECK: !kernel_arg_type ![[PIPE_TY:[0-9]+]] @@ -109,6 +118,7 @@ kernel void foo8(pipe int p1, pipe uchar p2, pipe uchar2 p3, const pipe uchar p4 kernel void foo9(signed char sc1, global const signed char* sc2) {} // CHECK: define{{.*}} spir_kernel void @foo9{{[^!]+}} +// ARGINFO: define{{.*}} spir_kernel void @foo9{{[^!]+}} // CHECK: !kernel_arg_addr_space ![[SCHAR_AS_QUAL:[0-9]+]] // CHECK: !kernel_arg_access_qual ![[MD42]] // CHECK: !kernel_arg_type ![[SCHAR_TY:[0-9]+]] diff --git a/clang/test/CodeGenOpenCL/opencl-kernel-call.cl b/clang/test/CodeGenOpenCL/opencl-kernel-call.cl index f575728f23763..aa6506de7899c 100644 --- a/clang/test/CodeGenOpenCL/opencl-kernel-call.cl +++ b/clang/test/CodeGenOpenCL/opencl-kernel-call.cl @@ -1,43 +1,104 @@ -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -emit-llvm -o - %s | FileCheck %s +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes --include-generated-funcs --version 5 +// RUN: %clang_cc1 -O0 -disable-llvm-passes -triple amdgcn-unknown-unknown -emit-llvm -o - %s | FileCheck %s -// CHECK: define dso_local amdgpu_kernel void @callee_kern({{.*}}) __attribute__((noinline)) kernel void callee_kern(global int *A){ *A = 1; } __attribute__((noinline)) kernel void ext_callee_kern(global int *A); -// CHECK: define dso_local void @callee_func({{.*}}) __attribute__((noinline)) void callee_func(global int *A){ *A = 2; } -// CHECK: define dso_local amdgpu_kernel void @caller_kern({{.*}}) kernel void caller_kern(global int* A){ callee_kern(A); - // CHECK: tail call void @__clang_ocl_kern_imp_callee_kern({{.*}}) ext_callee_kern(A); - // CHECK: tail call void @__clang_ocl_kern_imp_ext_callee_kern({{.*}}) callee_func(A); - // CHECK: tail call void @callee_func({{.*}}) } -// CHECK: define dso_local void @__clang_ocl_kern_imp_callee_kern({{.*}}) - -// CHECK: declare void @__clang_ocl_kern_imp_ext_callee_kern({{.*}}) - -// CHECK: define dso_local void @caller_func({{.*}}) void caller_func(global int* A){ callee_kern(A); - // CHECK: tail call void @__clang_ocl_kern_imp_callee_kern({{.*}}) #7 ext_callee_kern(A); - // CHECK: tail call void @__clang_ocl_kern_imp_ext_callee_kern({{.*}}) #8 callee_func(A); - // CHECK: tail call void @callee_func({{.*}}) } - -// CHECK: define dso_local void @__clang_ocl_kern_imp_caller_kern({{.*}}) -// CHECK: tail call void @__clang_ocl_kern_imp_callee_kern({{.*}}) -// CHECK: tail call void @__clang_ocl_kern_imp_ext_callee_kern({{.*}}) -// CHECK: tail call void @callee_func({{.*}}) +// CHECK: Function Attrs: convergent noinline norecurse nounwind optnone +// CHECK-LABEL: define dso_local amdgpu_kernel void @callee_kern( +// CHECK-SAME: ptr addrspace(1) noundef align 4 [[A:%.*]]) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 +// CHECK-NEXT: call void @__clang_ocl_kern_imp_callee_kern(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR4:[0-9]+]] +// CHECK-NEXT: ret void +// +// +// CHECK: Function Attrs: alwaysinline convergent norecurse nounwind +// CHECK-LABEL: define dso_local void @__clang_ocl_kern_imp_callee_kern( +// CHECK-SAME: ptr addrspace(1) noundef align 4 [[A:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 +// CHECK-NEXT: store i32 1, ptr addrspace(1) [[TMP0]], align 4 +// CHECK-NEXT: ret void +// +// +// CHECK: Function Attrs: convergent noinline norecurse nounwind optnone +// CHECK-LABEL: define dso_local void @callee_func( +// CHECK-SAME: ptr addrspace(1) noundef [[A:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 +// CHECK-NEXT: store i32 2, ptr addrspace(1) [[TMP0]], align 4 +// CHECK-NEXT: ret void +// +// +// CHECK: Function Attrs: convergent noinline norecurse nounwind optnone +// CHECK-LABEL: define dso_local amdgpu_kernel void @caller_kern( +// CHECK-SAME: ptr addrspace(1) noundef align 4 [[A:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 +// CHECK-NEXT: call void @__clang_ocl_kern_imp_caller_kern(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR4]] +// CHECK-NEXT: ret void +// +// +// CHECK: Function Attrs: alwaysinline convergent norecurse nounwind +// CHECK-LABEL: define dso_local void @__clang_ocl_kern_imp_caller_kern( +// CHECK-SAME: ptr addrspace(1) noundef align 4 [[A:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 +// CHECK-NEXT: call void @__clang_ocl_kern_imp_callee_kern(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 +// CHECK-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern(ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 +// CHECK-NEXT: call void @callee_func(ptr addrspace(1) noundef [[TMP2]]) #[[ATTR5:[0-9]+]] +// CHECK-NEXT: ret void +// +// +// CHECK: Function Attrs: convergent noinline norecurse nounwind optnone +// CHECK-LABEL: define dso_local void @caller_func( +// CHECK-SAME: ptr addrspace(1) noundef [[A:%.*]]) #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 +// CHECK-NEXT: call void @__clang_ocl_kern_imp_callee_kern(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 +// CHECK-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern(ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 +// CHECK-NEXT: call void @callee_func(ptr addrspace(1) noundef [[TMP2]]) #[[ATTR5]] +// CHECK-NEXT: ret void +// +//. +// CHECK: [[META4]] = !{i32 1} +// CHECK: [[META5]] = !{!"none"} +// CHECK: [[META6]] = !{!"int*"} +// CHECK: [[META7]] = !{!""} +//. diff --git a/clang/test/CodeGenOpenCL/reflect.cl b/clang/test/CodeGenOpenCL/reflect.cl index f5b618f6a35d3..fe43187d6a3f1 100644 --- a/clang/test/CodeGenOpenCL/reflect.cl +++ b/clang/test/CodeGenOpenCL/reflect.cl @@ -15,12 +15,15 @@ bool device_function() { // CHECK-LABEL: define dso_local ptx_kernel void @kernel_function( // CHECK-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6:![0-9]+]] { // CHECK-NEXT: entry: +// CHECK-NEXT: [[I_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 // CHECK-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // CHECK-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR]], align 4 -// CHECK-NEXT: [[CALL:%.*]] = call zeroext i1 @device_function() #[[ATTR3:[0-9]+]] -// CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CALL]] to i32 // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[I_ADDR]], align 4 -// CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[TMP0]], align 4 +// CHECK-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[I_ADDR_I]], align 4 +// CHECK-NEXT: [[CALL_I:%.*]] = call zeroext i1 @device_function() #[[ATTR4:[0-9]+]] +// CHECK-NEXT: [[CONV_I:%.*]] = zext i1 [[CALL_I]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[I_ADDR_I]], align 4 +// CHECK-NEXT: store i32 [[CONV_I]], ptr addrspace(1) [[TMP1]], align 4 // CHECK-NEXT: ret void // __kernel void kernel_function(__global int *i) { diff --git a/clang/test/CodeGenOpenCL/spir-calling-conv.cl b/clang/test/CodeGenOpenCL/spir-calling-conv.cl index 6c8f20511b8bd..b2015ea0f729d 100644 --- a/clang/test/CodeGenOpenCL/spir-calling-conv.cl +++ b/clang/test/CodeGenOpenCL/spir-calling-conv.cl @@ -11,8 +11,8 @@ kernel void foo(global int *A) // CHECK: %{{[a-z0-9_]+}} = tail call spir_func i32 @get_dummy_id(i32 noundef 0) A[id] = id; bar(A); - // CHECK: tail call void @__clang_ocl_kern_imp_bar(ptr addrspace(1) noundef align 4 %A) + // CHECK: tail call spir_func void @__clang_ocl_kern_imp_bar(ptr addrspace(1) noundef align 4 %A) } // CHECK: declare spir_func i32 @get_dummy_id(i32 noundef) -// CHECK: declare void @__clang_ocl_kern_imp_bar(ptr addrspace(1) noundef align 4) +// CHECK: declare spir_func void @__clang_ocl_kern_imp_bar(ptr addrspace(1) noundef align 4) diff --git a/clang/test/CodeGenOpenCL/visibility.cl b/clang/test/CodeGenOpenCL/visibility.cl index e5dc5b29c5140..31807aacf8c2a 100644 --- a/clang/test/CodeGenOpenCL/visibility.cl +++ b/clang/test/CodeGenOpenCL/visibility.cl @@ -37,22 +37,33 @@ __attribute__((visibility("protected"))) extern int ext_protected; // FVIS-PROTECTED: @ext_default = external local_unnamed_addr // FVIS-HIDDEN: @ext_default = external local_unnamed_addr __attribute__((visibility("default"))) extern int ext_default; - // FVIS-DEFAULT: define{{.*}} amdgpu_kernel void @kern() // FVIS-PROTECTED: define protected amdgpu_kernel void @kern() // FVIS-HIDDEN: define protected amdgpu_kernel void @kern() +// FVIS-DEFAULT: define{{.*}} void @__clang_ocl_kern_imp_kern() +// FVIS-PROTECTED: define protected void @__clang_ocl_kern_imp_kern() +// FVIS-HIDDEN: define protected void @__clang_ocl_kern_imp_kern() kernel void kern() {} // FVIS-DEFAULT: define protected amdgpu_kernel void @kern_hidden() // FVIS-PROTECTED: define protected amdgpu_kernel void @kern_hidden() // FVIS-HIDDEN: define protected amdgpu_kernel void @kern_hidden() +// FVIS-DEFAULT: define protected void @__clang_ocl_kern_imp_kern_hidden() +// FVIS-PROTECTED: define protected void @__clang_ocl_kern_imp_kern_hidden() +// FVIS-HIDDEN: define protected void @__clang_ocl_kern_imp_kern_hidden() __attribute__((visibility("hidden"))) kernel void kern_hidden() {} // FVIS-DEFAULT: define protected amdgpu_kernel void @kern_protected() // FVIS-PROTECTED: define protected amdgpu_kernel void @kern_protected() // FVIS-HIDDEN: define protected amdgpu_kernel void @kern_protected() +// FVIS-DEFAULT: define protected void @__clang_ocl_kern_imp_kern_protected() +// FVIS-PROTECTED: define protected void @__clang_ocl_kern_imp_kern_protected() +// FVIS-HIDDEN: define protected void @__clang_ocl_kern_imp_kern_protected() __attribute__((visibility("protected"))) kernel void kern_protected() {} // FVIS-DEFAULT: define{{.*}} amdgpu_kernel void @kern_default() // FVIS-PROTECTED: define{{.*}} amdgpu_kernel void @kern_default() // FVIS-HIDDEN: define{{.*}} amdgpu_kernel void @kern_default() +// FVIS-DEFAULT: define{{.*}} void @__clang_ocl_kern_imp_kern_default() +// FVIS-PROTECTED: define{{.*}} void @__clang_ocl_kern_imp_kern_default() +// FVIS-HIDDEN: define{{.*}} void @__clang_ocl_kern_imp_kern_default() __attribute__((visibility("default"))) kernel void kern_default() {} // FVIS-DEFAULT: define{{.*}} void @func() @@ -138,18 +149,5 @@ void use() { // FVIS-PROTECTED: declare void @ext_func_default() // FVIS-HIDDEN: declare void @ext_func_default() -// FVIS-DEFAULT: define{{.*}} void @__clang_ocl_kern_imp_kern() -// FVIS-PROTECTED: define protected void @__clang_ocl_kern_imp_kern() -// FVIS-HIDDEN: define protected void @__clang_ocl_kern_imp_kern() -// FVIS-DEFAULT: define protected void @__clang_ocl_kern_imp_kern_hidden() -// FVIS-PROTECTED: define protected void @__clang_ocl_kern_imp_kern_hidden() -// FVIS-HIDDEN: define protected void @__clang_ocl_kern_imp_kern_hidden() -// FVIS-DEFAULT: define protected void @__clang_ocl_kern_imp_kern_protected() -// FVIS-PROTECTED: define protected void @__clang_ocl_kern_imp_kern_protected() -// FVIS-HIDDEN: define protected void @__clang_ocl_kern_imp_kern_protected() - -// FVIS-DEFAULT: define{{.*}} void @__clang_ocl_kern_imp_kern_default() -// FVIS-PROTECTED: define{{.*}} void @__clang_ocl_kern_imp_kern_default() -// FVIS-HIDDEN: define{{.*}} void @__clang_ocl_kern_imp_kern_default() diff --git a/clang/test/CodeGenOpenCLCXX/addrspace-of-this.clcpp b/clang/test/CodeGenOpenCLCXX/addrspace-of-this.clcpp index 2f1b6c196fd51..8395a40095b94 100644 --- a/clang/test/CodeGenOpenCLCXX/addrspace-of-this.clcpp +++ b/clang/test/CodeGenOpenCLCXX/addrspace-of-this.clcpp @@ -125,9 +125,6 @@ __kernel void test__global() { // EXPL: call spir_func void @_ZNU3AS41CC1EOU3AS4S_(ptr addrspace(4) {{[^,]*}} [[C5GEN]], ptr addrspace(4) noundef align 4 dereferenceable(4) [[CALL]]) // IMPL: call void @llvm.memcpy.p0.p4.i32(ptr {{.*}}%c5, ptr addrspace(4) {{.*}}[[CALL]] -// Tests address space of inline members -//COMMON: @_ZNU3AS41C3getEv(ptr addrspace(4) {{[^,]*}} %this) -//COMMON: @_ZNU3AS41CplERU3AS4KS_(ptr dead_on_unwind noalias writable sret(%class.C) align 4 %agg.result, ptr addrspace(4) {{[^,]*}} %this #define TEST(AS) \ __kernel void test##AS() { \ AS C c; \ @@ -190,6 +187,10 @@ TEST(__private) // EXPL: call spir_func noundef align 4 dereferenceable(4) ptr addrspace(4) @_ZNU3AS41CaSERU3AS4KS_(ptr addrspace(4) {{[^,]*}} [[C2GEN]], ptr addrspace(4) noundef align 4 dereferenceable(4) [[C1GEN]]) // IMPL: call void @llvm.memcpy.p4.p4.i32(ptr addrspace(4) {{.*}}[[C2GEN]], ptr addrspace(4) {{.*}}[[C1GEN]] +// Tests address space of inline members +//COMMON: @_ZNU3AS41C3getEv(ptr addrspace(4) {{[^,]*}} %this) +//COMMON: @_ZNU3AS41CplERU3AS4KS_(ptr dead_on_unwind noalias writable sret(%class.C) align 4 %agg.result, ptr addrspace(4) {{[^,]*}} %this + // Test that calling a const method from a non-const method does not crash Clang. class ConstAndNonConstMethod { public: From ae8f16104503588451f809f5dde435d3aae8a7c4 Mon Sep 17 00:00:00 2001 From: anikelal Date: Wed, 29 Jan 2025 12:11:52 +0530 Subject: [PATCH 4/8] Set correct calling convention for stubs in X86 and SPIR targets Test aggregate paratemers passed byref and byval --- clang/lib/CodeGen/TargetInfo.cpp | 6 + clang/lib/CodeGen/TargetInfo.h | 3 +- clang/lib/CodeGen/Targets/AMDGPU.cpp | 8 - clang/lib/CodeGen/Targets/X86.cpp | 6 - .../CodeGenOpenCL/cl20-device-side-enqueue.cl | 2 +- .../test/CodeGenOpenCL/opencl-kernel-call.cl | 1362 +++++++++++++++-- 6 files changed, 1282 insertions(+), 105 deletions(-) diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp index 64a9a5554caf7..981488eb4dc37 100644 --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -117,6 +117,12 @@ unsigned TargetCodeGenInfo::getOpenCLKernelCallingConv() const { return llvm::CallingConv::SPIR_KERNEL; } +void TargetCodeGenInfo::setOCLKernelStubCallingConvention( + const FunctionType *&FT) const { + FT = getABIInfo().getContext().adjustFunctionType( + FT, FT->getExtInfo().withCallingConv(CC_C)); +} + llvm::Constant *TargetCodeGenInfo::getNullPointer(const CodeGen::CodeGenModule &CGM, llvm::PointerType *T, QualType QT) const { return llvm::ConstantPointerNull::get(T); diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h index 9149e2fe8a0ab..ef4e500f7a38c 100644 --- a/clang/lib/CodeGen/TargetInfo.h +++ b/clang/lib/CodeGen/TargetInfo.h @@ -400,8 +400,7 @@ class TargetCodeGenInfo { virtual bool shouldEmitDWARFBitFieldSeparators() const { return false; } virtual void setCUDAKernelCallingConvention(const FunctionType *&FT) const {} - virtual void - setOCLKernelStubCallingConvention(const FunctionType *&FT) const {} + virtual void setOCLKernelStubCallingConvention(const FunctionType *&FT) const; /// Return the device-side type for the CUDA device builtin surface type. virtual llvm::Type *getCUDADeviceBuiltinSurfaceDeviceType() const { // By default, no change from the original one. diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index 8e92c591671f8..db2a2c5740646 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -331,8 +331,6 @@ class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo { bool shouldEmitStaticExternCAliases() const override; bool shouldEmitDWARFBitFieldSeparators() const override; void setCUDAKernelCallingConvention(const FunctionType *&FT) const override; - void - setOCLKernelStubCallingConvention(const FunctionType *&FT) const override; }; } @@ -594,12 +592,6 @@ getAMDGPURuntimeHandleType(llvm::LLVMContext &C, "block.runtime.handle.t"); } -void AMDGPUTargetCodeGenInfo::setOCLKernelStubCallingConvention( - const FunctionType *&FT) const { - FT = getABIInfo().getContext().adjustFunctionType( - FT, FT->getExtInfo().withCallingConv(CC_C)); -} - /// Create an OpenCL kernel for an enqueued block. /// /// The type of the first argument (the block literal) is the struct type diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp index 3799e6d7c2545..b36a6e1396653 100644 --- a/clang/lib/CodeGen/Targets/X86.cpp +++ b/clang/lib/CodeGen/Targets/X86.cpp @@ -1495,12 +1495,6 @@ class X86_64TargetCodeGenInfo : public TargetCodeGenInfo { const FunctionDecl *Caller, const FunctionDecl *Callee, const CallArgList &Args, QualType ReturnType) const override; - - void - setOCLKernelStubCallingConvention(const FunctionType *&FT) const override { - FT = getABIInfo().getContext().adjustFunctionType( - FT, FT->getExtInfo().withCallingConv(CC_SpirFunction)); - } }; } // namespace diff --git a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl index e7f2469f968f9..4809168b7fc8b 100644 --- a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl +++ b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl @@ -39,7 +39,7 @@ void callee(int id, __global int *out) { out[id] = id; } -// COMMON-LABEL: define{{.*}} spir_func void @__clang_ocl_kern_imp_device_side_enqueue(ptr addrspace(1) align 4 %{{.*}}, ptr addrspace(1) align 4 %b, i32 %i) +// COMMON-LABEL: define{{.*}} void @__clang_ocl_kern_imp_device_side_enqueue(ptr addrspace(1) align 4 %{{.*}}, ptr addrspace(1) align 4 %b, i32 %i) kernel void device_side_enqueue(global int *a, global int *b, int i) { // SPIR: %default_queue = alloca target("spirv.Queue") // X86: %default_queue = alloca ptr diff --git a/clang/test/CodeGenOpenCL/opencl-kernel-call.cl b/clang/test/CodeGenOpenCL/opencl-kernel-call.cl index aa6506de7899c..c08c56912644c 100644 --- a/clang/test/CodeGenOpenCL/opencl-kernel-call.cl +++ b/clang/test/CodeGenOpenCL/opencl-kernel-call.cl @@ -1,104 +1,1290 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes --include-generated-funcs --version 5 -// RUN: %clang_cc1 -O0 -disable-llvm-passes -triple amdgcn-unknown-unknown -emit-llvm -o - %s | FileCheck %s +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes --include-generated-funcs --version 4 +// RUN: %clang_cc1 -O0 -triple i686-pc-darwin -emit-llvm -o - %s | FileCheck -check-prefix=X86 %s +// RUN: %clang_cc1 -O0 -triple amdgcn -emit-llvm -o - %s | FileCheck -check-prefix=AMDGCN %s + +#pragma OPENCL EXTENSION __cl_clang_function_pointers : enable + +typedef int int2 __attribute__((ext_vector_type(2))); + +typedef struct { + int cells[9]; +} Mat3X3; + +typedef struct { + int cells[16]; +} Mat4X4; + +typedef struct { + int cells[1024]; +} Mat32X32; + +typedef struct { + int cells[4096]; +} Mat64X64; + +struct StructOneMember { + int2 x; +}; + +struct StructTwoMember { + int2 x; + int2 y; +}; + +struct LargeStructOneMember { + int2 x[100]; +}; + +struct LargeStructTwoMember { + int2 x[40]; + int2 y[20]; +}; + +Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { + Mat4X4 out; + return out; +} + +Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { + Mat64X64 out; + return out; +} + +void FuncOneMember(struct StructOneMember u) { + u.x = (int2)(0, 0); +} + +void FuncOneLargeMember(struct LargeStructOneMember u) { + u.x[0] = (int2)(0, 0); +} + +void FuncTwoMember(struct StructTwoMember u) { + u.y = (int2)(0, 0); +} + +void FuncLargeTwoMember(struct LargeStructTwoMember u) { + u.y[0] = (int2)(0, 0); +} __attribute__((noinline)) kernel void callee_kern(global int *A){ *A = 1; } -__attribute__((noinline)) kernel void ext_callee_kern(global int *A); +kernel void callee_kern_Mat3X3(global Mat3X3 *in, global Mat4X4 *out) { + out[0] = foo(in[1]); +} -__attribute__((noinline)) void callee_func(global int *A){ - *A = 2; +kernel void callee_kern_Mat32X32(global Mat32X32 *in, global Mat64X64 *out) { + out[0] = foo_large(in[1]); } -kernel void caller_kern(global int* A){ - callee_kern(A); - ext_callee_kern(A); - callee_func(A); +kernel void KernelOneMember(struct StructOneMember u) { + FuncOneMember(u); +} + +kernel void KernelLargeOneMember(struct LargeStructOneMember u) { + FuncOneLargeMember(u); +} + +kernel void KernelTwoMember(struct StructTwoMember u) { + FuncTwoMember(u); +} +kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { + FuncLargeTwoMember(u); } -void caller_func(global int* A){ +__attribute__((noinline)) kernel void ext_callee_kern(global int *A); + +kernel void ext_callee_kern_Mat3X3(global Mat3X3 *in, global Mat4X4 *out); + +kernel void ext_callee_kern_Mat32X32(global Mat32X32 *in, global Mat64X64 *out); + +kernel void ext_KernelOneMember(struct StructOneMember u); + +kernel void ext_KernelLargeOneMember(struct LargeStructOneMember u); + +kernel void ext_KernelTwoMember(struct StructTwoMember u); + +kernel void ext_KernelLargeTwoMember(struct LargeStructTwoMember u); + +kernel void caller_kern(global int* A, global Mat3X3 *mat3X3, global Mat4X4 *mat4X4, global Mat32X32 *mat32X32, global Mat64X64 *mat64X64){ callee_kern(A); ext_callee_kern(A); - callee_func(A); -} -// CHECK: Function Attrs: convergent noinline norecurse nounwind optnone -// CHECK-LABEL: define dso_local amdgpu_kernel void @callee_kern( -// CHECK-SAME: ptr addrspace(1) noundef align 4 [[A:%.*]]) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 -// CHECK-NEXT: call void @__clang_ocl_kern_imp_callee_kern(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR4:[0-9]+]] -// CHECK-NEXT: ret void -// -// -// CHECK: Function Attrs: alwaysinline convergent norecurse nounwind -// CHECK-LABEL: define dso_local void @__clang_ocl_kern_imp_callee_kern( -// CHECK-SAME: ptr addrspace(1) noundef align 4 [[A:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 -// CHECK-NEXT: store i32 1, ptr addrspace(1) [[TMP0]], align 4 -// CHECK-NEXT: ret void -// -// -// CHECK: Function Attrs: convergent noinline norecurse nounwind optnone -// CHECK-LABEL: define dso_local void @callee_func( -// CHECK-SAME: ptr addrspace(1) noundef [[A:%.*]]) #[[ATTR2:[0-9]+]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 -// CHECK-NEXT: store i32 2, ptr addrspace(1) [[TMP0]], align 4 -// CHECK-NEXT: ret void -// -// -// CHECK: Function Attrs: convergent noinline norecurse nounwind optnone -// CHECK-LABEL: define dso_local amdgpu_kernel void @caller_kern( -// CHECK-SAME: ptr addrspace(1) noundef align 4 [[A:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 -// CHECK-NEXT: call void @__clang_ocl_kern_imp_caller_kern(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR4]] -// CHECK-NEXT: ret void -// -// -// CHECK: Function Attrs: alwaysinline convergent norecurse nounwind -// CHECK-LABEL: define dso_local void @__clang_ocl_kern_imp_caller_kern( -// CHECK-SAME: ptr addrspace(1) noundef align 4 [[A:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 -// CHECK-NEXT: call void @__clang_ocl_kern_imp_callee_kern(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR4]] -// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 -// CHECK-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern(ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR4]] -// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 -// CHECK-NEXT: call void @callee_func(ptr addrspace(1) noundef [[TMP2]]) #[[ATTR5:[0-9]+]] -// CHECK-NEXT: ret void -// -// -// CHECK: Function Attrs: convergent noinline norecurse nounwind optnone -// CHECK-LABEL: define dso_local void @caller_func( -// CHECK-SAME: ptr addrspace(1) noundef [[A:%.*]]) #[[ATTR2]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// CHECK-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 -// CHECK-NEXT: call void @__clang_ocl_kern_imp_callee_kern(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR4]] -// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 -// CHECK-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern(ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR4]] -// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 -// CHECK-NEXT: call void @callee_func(ptr addrspace(1) noundef [[TMP2]]) #[[ATTR5]] -// CHECK-NEXT: ret void + + callee_kern_Mat3X3(mat3X3, mat4X4); + callee_kern_Mat32X32(mat32X32, mat64X64); + ext_callee_kern_Mat3X3(mat3X3, mat4X4); + ext_callee_kern_Mat32X32(mat32X32, mat64X64); +} + +kernel void caller_kern2(struct StructOneMember structOneMem, global struct StructOneMember* global_structOneMem, struct StructTwoMember structTwoMem){ + KernelOneMember(structOneMem); + ext_KernelOneMember(structOneMem); + KernelTwoMember(structTwoMem); + ext_KernelTwoMember(structTwoMem); +} + +kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct LargeStructTwoMember largeStructTwoMem){ + KernelLargeOneMember(largeStructOneMem); + KernelLargeTwoMember(largeStructTwoMem); + ext_KernelLargeOneMember(largeStructOneMem); + ext_KernelLargeTwoMember(largeStructTwoMem); +} +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define void @foo( +// X86-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT4X4:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT3X3:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0:[0-9]+]] { +// X86-NEXT: entry: +// X86-NEXT: [[RESULT_PTR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4 +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define void @foo_large( +// X86-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT32X32:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0]] { +// X86-NEXT: entry: +// X86-NEXT: [[RESULT_PTR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4 +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define void @FuncOneMember( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// X86-NEXT: entry: +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 +// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 8, i1 false) +// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0 +// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[X]], align 8 +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define void @FuncOneLargeMember( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// X86-NEXT: entry: +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 +// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 800, i1 false) +// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0 +// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr [[X]], i32 0, i32 0 +// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8 +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define void @FuncTwoMember( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// X86-NEXT: entry: +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 +// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 16, i1 false) +// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1 +// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[Y]], align 8 +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define void @FuncLargeTwoMember( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// X86-NEXT: entry: +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 +// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 480, i1 false) +// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1 +// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr [[Y]], i32 0, i32 0 +// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8 +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define spir_kernel void @callee_kern( +// X86-SAME: ptr noundef align 4 [[A:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { +// X86-NEXT: entry: +// X86-NEXT: [[A_ADDR_I:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4 +// X86-NEXT: store ptr [[TMP0]], ptr [[A_ADDR_I]], align 4 +// X86-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR_I]], align 4 +// X86-NEXT: store i32 1, ptr [[TMP1]], align 4 +// X86-NEXT: ret void +// +// +// X86: Function Attrs: alwaysinline convergent norecurse nounwind +// X86-LABEL: define void @__clang_ocl_kern_imp_callee_kern( +// X86-SAME: ptr noundef align 4 [[A:%.*]]) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] { +// X86-NEXT: entry: +// X86-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4 +// X86-NEXT: store i32 1, ptr [[TMP0]], align 4 +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define spir_kernel void @callee_kern_Mat3X3( +// X86-SAME: ptr noundef align 4 [[IN:%.*]], ptr noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META9:![0-9]+]] !kernel_arg_type [[META10:![0-9]+]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11:![0-9]+]] { +// X86-NEXT: entry: +// X86-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4 +// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[IN]], ptr [[IN_ADDR]], align 4 +// X86-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[IN_ADDR]], align 4 +// X86-NEXT: [[TMP1:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4 +// X86-NEXT: store ptr [[TMP0]], ptr [[IN_ADDR_I]], align 4 +// X86-NEXT: store ptr [[TMP1]], ptr [[OUT_ADDR_I]], align 4 +// X86-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_I]], align 4 +// X86-NEXT: [[TMP3:%.*]] = load ptr, ptr [[IN_ADDR_I]], align 4 +// X86-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr [[TMP3]], i32 1 +// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP_I]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[ARRAYIDX1_I]]) #[[ATTR5:[0-9]+]] +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP2]], ptr align 4 [[TMP_I]], i32 64, i1 false) +// X86-NEXT: ret void +// +// +// X86: Function Attrs: alwaysinline convergent norecurse nounwind +// X86-LABEL: define void @__clang_ocl_kern_imp_callee_kern_Mat3X3( +// X86-SAME: ptr noundef align 4 [[IN:%.*]], ptr noundef align 4 [[OUT:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META10]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11]] { +// X86-NEXT: entry: +// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4 +// X86-NEXT: store ptr [[IN]], ptr [[IN_ADDR]], align 4 +// X86-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4 +// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr [[TMP0]], i32 0 +// X86-NEXT: [[TMP1:%.*]] = load ptr, ptr [[IN_ADDR]], align 4 +// X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr [[TMP1]], i32 1 +// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[ARRAYIDX1]]) #[[ATTR5]] +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 64, i1 false) +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define spir_kernel void @callee_kern_Mat32X32( +// X86-SAME: ptr noundef align 4 [[IN:%.*]], ptr noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META11]] { +// X86-NEXT: entry: +// X86-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4 +// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[IN]], ptr [[IN_ADDR]], align 4 +// X86-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[IN_ADDR]], align 4 +// X86-NEXT: [[TMP1:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4 +// X86-NEXT: store ptr [[TMP0]], ptr [[IN_ADDR_I]], align 4 +// X86-NEXT: store ptr [[TMP1]], ptr [[OUT_ADDR_I]], align 4 +// X86-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_I]], align 4 +// X86-NEXT: [[TMP3:%.*]] = load ptr, ptr [[IN_ADDR_I]], align 4 +// X86-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32:%.*]], ptr [[TMP3]], i32 1 +// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[ARRAYIDX1_I]]) #[[ATTR5]] +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP2]], ptr align 4 [[TMP_I]], i32 16384, i1 false) +// X86-NEXT: ret void +// +// +// X86: Function Attrs: alwaysinline convergent norecurse nounwind +// X86-LABEL: define void @__clang_ocl_kern_imp_callee_kern_Mat32X32( +// X86-SAME: ptr noundef align 4 [[IN:%.*]], ptr noundef align 4 [[OUT:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META11]] { +// X86-NEXT: entry: +// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4 +// X86-NEXT: store ptr [[IN]], ptr [[IN_ADDR]], align 4 +// X86-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4 +// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr [[TMP0]], i32 0 +// X86-NEXT: [[TMP1:%.*]] = load ptr, ptr [[IN_ADDR]], align 4 +// X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32:%.*]], ptr [[TMP1]], i32 1 +// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[ARRAYIDX1]]) #[[ATTR5]] +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 16384, i1 false) +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define spir_kernel void @KernelOneMember( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13:![0-9]+]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META14:![0-9]+]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META7]] { +// X86-NEXT: entry: +// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 +// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[U1]], ptr align 1 [[U]], i64 8, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 8, i1 false) +// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U_I]]) #[[ATTR5]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: alwaysinline convergent norecurse nounwind +// X86-LABEL: define void @__clang_ocl_kern_imp_KernelOneMember( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META14]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META7]] { +// X86-NEXT: entry: +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 8, i1 false) +// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR5]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define spir_kernel void @KernelLargeOneMember( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META7]] { +// X86-NEXT: entry: +// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 +// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[U1]], ptr align 1 [[U]], i64 800, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 800, i1 false) +// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U_I]]) #[[ATTR5]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: alwaysinline convergent norecurse nounwind +// X86-LABEL: define void @__clang_ocl_kern_imp_KernelLargeOneMember( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META7]] { +// X86-NEXT: entry: +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 800, i1 false) +// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR5]] +// X86-NEXT: ret void // +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define spir_kernel void @KernelTwoMember( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META7]] { +// X86-NEXT: entry: +// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 +// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[U1]], ptr align 1 [[U]], i64 16, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 16, i1 false) +// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U_I]]) #[[ATTR5]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: alwaysinline convergent norecurse nounwind +// X86-LABEL: define void @__clang_ocl_kern_imp_KernelTwoMember( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META7]] { +// X86-NEXT: entry: +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 16, i1 false) +// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR5]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define spir_kernel void @KernelLargeTwoMember( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META7]] { +// X86-NEXT: entry: +// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 +// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[U1]], ptr align 1 [[U]], i64 480, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 480, i1 false) +// X86-NEXT: call void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U_I]]) #[[ATTR5]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: alwaysinline convergent norecurse nounwind +// X86-LABEL: define void @__clang_ocl_kern_imp_KernelLargeTwoMember( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META7]] { +// X86-NEXT: entry: +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 480, i1 false) +// X86-NEXT: call void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR5]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define spir_kernel void @caller_kern( +// X86-SAME: ptr noundef align 4 [[A:%.*]], ptr noundef align 4 [[MAT3X3:%.*]], ptr noundef align 4 [[MAT4X4:%.*]], ptr noundef align 4 [[MAT32X32:%.*]], ptr noundef align 4 [[MAT64X64:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META18:![0-9]+]] !kernel_arg_access_qual [[META19:![0-9]+]] !kernel_arg_type [[META20:![0-9]+]] !kernel_arg_base_type [[META20]] !kernel_arg_type_qual [[META21:![0-9]+]] { +// X86-NEXT: entry: +// X86-NEXT: [[IN_ADDR_I1_I:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[OUT_ADDR_I2_I:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[TMP_I3_I:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4 +// X86-NEXT: [[IN_ADDR_I_I:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[OUT_ADDR_I_I:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[TMP_I_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4 +// X86-NEXT: [[A_ADDR_I_I:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[A_ADDR_I:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[MAT3X3_ADDR_I:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[MAT4X4_ADDR_I:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[MAT32X32_ADDR_I:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[MAT64X64_ADDR_I:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[MAT3X3_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[MAT4X4_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[MAT32X32_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[MAT64X64_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4 +// X86-NEXT: store ptr [[MAT3X3]], ptr [[MAT3X3_ADDR]], align 4 +// X86-NEXT: store ptr [[MAT4X4]], ptr [[MAT4X4_ADDR]], align 4 +// X86-NEXT: store ptr [[MAT32X32]], ptr [[MAT32X32_ADDR]], align 4 +// X86-NEXT: store ptr [[MAT64X64]], ptr [[MAT64X64_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4 +// X86-NEXT: [[TMP1:%.*]] = load ptr, ptr [[MAT3X3_ADDR]], align 4 +// X86-NEXT: [[TMP2:%.*]] = load ptr, ptr [[MAT4X4_ADDR]], align 4 +// X86-NEXT: [[TMP3:%.*]] = load ptr, ptr [[MAT32X32_ADDR]], align 4 +// X86-NEXT: [[TMP4:%.*]] = load ptr, ptr [[MAT64X64_ADDR]], align 4 +// X86-NEXT: store ptr [[TMP0]], ptr [[A_ADDR_I]], align 4 +// X86-NEXT: store ptr [[TMP1]], ptr [[MAT3X3_ADDR_I]], align 4 +// X86-NEXT: store ptr [[TMP2]], ptr [[MAT4X4_ADDR_I]], align 4 +// X86-NEXT: store ptr [[TMP3]], ptr [[MAT32X32_ADDR_I]], align 4 +// X86-NEXT: store ptr [[TMP4]], ptr [[MAT64X64_ADDR_I]], align 4 +// X86-NEXT: [[TMP5:%.*]] = load ptr, ptr [[A_ADDR_I]], align 4 +// X86-NEXT: store ptr [[TMP5]], ptr [[A_ADDR_I_I]], align 4 +// X86-NEXT: [[TMP6:%.*]] = load ptr, ptr [[A_ADDR_I_I]], align 4 +// X86-NEXT: store i32 1, ptr [[TMP6]], align 4 +// X86-NEXT: [[TMP7:%.*]] = load ptr, ptr [[A_ADDR_I]], align 4 +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern(ptr noundef align 4 [[TMP7]]) #[[ATTR6:[0-9]+]] +// X86-NEXT: [[TMP8:%.*]] = load ptr, ptr [[MAT3X3_ADDR_I]], align 4 +// X86-NEXT: [[TMP9:%.*]] = load ptr, ptr [[MAT4X4_ADDR_I]], align 4 +// X86-NEXT: store ptr [[TMP8]], ptr [[IN_ADDR_I_I]], align 4 +// X86-NEXT: store ptr [[TMP9]], ptr [[OUT_ADDR_I_I]], align 4 +// X86-NEXT: [[TMP10:%.*]] = load ptr, ptr [[OUT_ADDR_I_I]], align 4 +// X86-NEXT: [[TMP11:%.*]] = load ptr, ptr [[IN_ADDR_I_I]], align 4 +// X86-NEXT: [[ARRAYIDX1_I_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr [[TMP11]], i32 1 +// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP_I_I]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[ARRAYIDX1_I_I]]) #[[ATTR5]] +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP10]], ptr align 4 [[TMP_I_I]], i32 64, i1 false) +// X86-NEXT: [[TMP12:%.*]] = load ptr, ptr [[MAT32X32_ADDR_I]], align 4 +// X86-NEXT: [[TMP13:%.*]] = load ptr, ptr [[MAT64X64_ADDR_I]], align 4 +// X86-NEXT: store ptr [[TMP12]], ptr [[IN_ADDR_I1_I]], align 4 +// X86-NEXT: store ptr [[TMP13]], ptr [[OUT_ADDR_I2_I]], align 4 +// X86-NEXT: [[TMP14:%.*]] = load ptr, ptr [[OUT_ADDR_I2_I]], align 4 +// X86-NEXT: [[TMP15:%.*]] = load ptr, ptr [[IN_ADDR_I1_I]], align 4 +// X86-NEXT: [[ARRAYIDX1_I4_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32:%.*]], ptr [[TMP15]], i32 1 +// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I3_I]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[ARRAYIDX1_I4_I]]) #[[ATTR5]] +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP14]], ptr align 4 [[TMP_I3_I]], i32 16384, i1 false) +// X86-NEXT: [[TMP16:%.*]] = load ptr, ptr [[MAT3X3_ADDR_I]], align 4 +// X86-NEXT: [[TMP17:%.*]] = load ptr, ptr [[MAT4X4_ADDR_I]], align 4 +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern_Mat3X3(ptr noundef align 4 [[TMP16]], ptr noundef align 4 [[TMP17]]) #[[ATTR6]] +// X86-NEXT: [[TMP18:%.*]] = load ptr, ptr [[MAT32X32_ADDR_I]], align 4 +// X86-NEXT: [[TMP19:%.*]] = load ptr, ptr [[MAT64X64_ADDR_I]], align 4 +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern_Mat32X32(ptr noundef align 4 [[TMP18]], ptr noundef align 4 [[TMP19]]) #[[ATTR6]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: alwaysinline convergent norecurse nounwind +// X86-LABEL: define void @__clang_ocl_kern_imp_caller_kern( +// X86-SAME: ptr noundef align 4 [[A:%.*]], ptr noundef align 4 [[MAT3X3:%.*]], ptr noundef align 4 [[MAT4X4:%.*]], ptr noundef align 4 [[MAT32X32:%.*]], ptr noundef align 4 [[MAT64X64:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META18]] !kernel_arg_access_qual [[META19]] !kernel_arg_type [[META20]] !kernel_arg_base_type [[META20]] !kernel_arg_type_qual [[META21]] { +// X86-NEXT: entry: +// X86-NEXT: [[IN_ADDR_I1:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[OUT_ADDR_I2:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[TMP_I3:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4 +// X86-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4 +// X86-NEXT: [[A_ADDR_I:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[MAT3X3_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[MAT4X4_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[MAT32X32_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[MAT64X64_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4 +// X86-NEXT: store ptr [[MAT3X3]], ptr [[MAT3X3_ADDR]], align 4 +// X86-NEXT: store ptr [[MAT4X4]], ptr [[MAT4X4_ADDR]], align 4 +// X86-NEXT: store ptr [[MAT32X32]], ptr [[MAT32X32_ADDR]], align 4 +// X86-NEXT: store ptr [[MAT64X64]], ptr [[MAT64X64_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4 +// X86-NEXT: store ptr [[TMP0]], ptr [[A_ADDR_I]], align 4 +// X86-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR_I]], align 4 +// X86-NEXT: store i32 1, ptr [[TMP1]], align 4 +// X86-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A_ADDR]], align 4 +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern(ptr noundef align 4 [[TMP2]]) #[[ATTR6]] +// X86-NEXT: [[TMP3:%.*]] = load ptr, ptr [[MAT3X3_ADDR]], align 4 +// X86-NEXT: [[TMP4:%.*]] = load ptr, ptr [[MAT4X4_ADDR]], align 4 +// X86-NEXT: store ptr [[TMP3]], ptr [[IN_ADDR_I]], align 4 +// X86-NEXT: store ptr [[TMP4]], ptr [[OUT_ADDR_I]], align 4 +// X86-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_I]], align 4 +// X86-NEXT: [[TMP6:%.*]] = load ptr, ptr [[IN_ADDR_I]], align 4 +// X86-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr [[TMP6]], i32 1 +// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP_I]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[ARRAYIDX1_I]]) #[[ATTR5]] +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP5]], ptr align 4 [[TMP_I]], i32 64, i1 false) +// X86-NEXT: [[TMP7:%.*]] = load ptr, ptr [[MAT32X32_ADDR]], align 4 +// X86-NEXT: [[TMP8:%.*]] = load ptr, ptr [[MAT64X64_ADDR]], align 4 +// X86-NEXT: store ptr [[TMP7]], ptr [[IN_ADDR_I1]], align 4 +// X86-NEXT: store ptr [[TMP8]], ptr [[OUT_ADDR_I2]], align 4 +// X86-NEXT: [[TMP9:%.*]] = load ptr, ptr [[OUT_ADDR_I2]], align 4 +// X86-NEXT: [[TMP10:%.*]] = load ptr, ptr [[IN_ADDR_I1]], align 4 +// X86-NEXT: [[ARRAYIDX1_I4:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32:%.*]], ptr [[TMP10]], i32 1 +// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I3]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[ARRAYIDX1_I4]]) #[[ATTR5]] +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP9]], ptr align 4 [[TMP_I3]], i32 16384, i1 false) +// X86-NEXT: [[TMP11:%.*]] = load ptr, ptr [[MAT3X3_ADDR]], align 4 +// X86-NEXT: [[TMP12:%.*]] = load ptr, ptr [[MAT4X4_ADDR]], align 4 +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern_Mat3X3(ptr noundef align 4 [[TMP11]], ptr noundef align 4 [[TMP12]]) #[[ATTR6]] +// X86-NEXT: [[TMP13:%.*]] = load ptr, ptr [[MAT32X32_ADDR]], align 4 +// X86-NEXT: [[TMP14:%.*]] = load ptr, ptr [[MAT64X64_ADDR]], align 4 +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern_Mat32X32(ptr noundef align 4 [[TMP13]], ptr noundef align 4 [[TMP14]]) #[[ATTR6]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define spir_kernel void @caller_kern2( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[STRUCTONEMEM:%.*]], ptr noundef align 8 [[GLOBAL_STRUCTONEMEM:%.*]], ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[STRUCTTWOMEM:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META22:![0-9]+]] !kernel_arg_access_qual [[META23:![0-9]+]] !kernel_arg_type [[META24:![0-9]+]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25:![0-9]+]] { +// X86-NEXT: entry: +// X86-NEXT: [[U_I4_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 +// X86-NEXT: [[STRUCTTWOMEM3_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 +// X86-NEXT: [[U_I_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 +// X86-NEXT: [[STRUCTONEMEM2_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 +// X86-NEXT: [[STRUCTONEMEM_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 +// X86-NEXT: [[STRUCTTWOMEM_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 +// X86-NEXT: [[GLOBAL_STRUCTONEMEM_ADDR_I:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[STRUCTTWOMEM2:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 +// X86-NEXT: [[STRUCTONEMEM1:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 +// X86-NEXT: [[GLOBAL_STRUCTONEMEM_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[GLOBAL_STRUCTONEMEM]], ptr [[GLOBAL_STRUCTONEMEM_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[GLOBAL_STRUCTONEMEM_ADDR]], align 4 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[STRUCTTWOMEM2]], ptr align 1 [[STRUCTTWOMEM]], i64 16, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[STRUCTONEMEM1]], ptr align 1 [[STRUCTONEMEM]], i64 8, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[STRUCTONEMEM_I]], ptr align 4 [[STRUCTONEMEM1]], i32 8, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[STRUCTTWOMEM_I]], ptr align 4 [[STRUCTTWOMEM2]], i32 16, i1 false) +// X86-NEXT: store ptr [[TMP0]], ptr [[GLOBAL_STRUCTONEMEM_ADDR_I]], align 4 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[STRUCTONEMEM2_I]], ptr align 1 [[STRUCTONEMEM_I]], i64 8, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I_I]], ptr align 4 [[STRUCTONEMEM2_I]], i32 8, i1 false) +// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U_I_I]]) #[[ATTR5]] +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_KernelOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[STRUCTONEMEM_I]]) #[[ATTR6]] +// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[STRUCTTWOMEM3_I]], ptr align 1 [[STRUCTTWOMEM_I]], i64 16, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I4_I]], ptr align 4 [[STRUCTTWOMEM3_I]], i32 16, i1 false) +// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U_I4_I]]) #[[ATTR5]] +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_KernelTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[STRUCTTWOMEM_I]]) #[[ATTR6]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: alwaysinline convergent norecurse nounwind +// X86-LABEL: define void @__clang_ocl_kern_imp_caller_kern2( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]], ptr noundef align 8 [[GLOBAL_STRUCTONEMEM:%.*]], ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 4 [[TMP1:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META22]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25]] { +// X86-NEXT: entry: +// X86-NEXT: [[U_I4:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 +// X86-NEXT: [[STRUCTTWOMEM3:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 +// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 +// X86-NEXT: [[STRUCTONEMEM2:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 +// X86-NEXT: [[STRUCTONEMEM:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 +// X86-NEXT: [[STRUCTTWOMEM:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 +// X86-NEXT: [[GLOBAL_STRUCTONEMEM_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[STRUCTONEMEM]], ptr align 4 [[TMP0]], i32 8, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[STRUCTTWOMEM]], ptr align 4 [[TMP1]], i32 16, i1 false) +// X86-NEXT: store ptr [[GLOBAL_STRUCTONEMEM]], ptr [[GLOBAL_STRUCTONEMEM_ADDR]], align 4 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[STRUCTONEMEM2]], ptr align 1 [[STRUCTONEMEM]], i64 8, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[STRUCTONEMEM2]], i32 8, i1 false) +// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U_I]]) #[[ATTR5]] +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_KernelOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[STRUCTONEMEM]]) #[[ATTR6]] +// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[STRUCTTWOMEM3]], ptr align 1 [[STRUCTTWOMEM]], i64 16, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I4]], ptr align 4 [[STRUCTTWOMEM3]], i32 16, i1 false) +// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U_I4]]) #[[ATTR5]] +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_KernelTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[STRUCTTWOMEM]]) #[[ATTR6]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define spir_kernel void @caller_kern3( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[LARGESTRUCTONEMEM:%.*]], ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[LARGESTRUCTTWOMEM:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META26:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META27:![0-9]+]] !kernel_arg_base_type [[META27]] !kernel_arg_type_qual [[META11]] { +// X86-NEXT: entry: +// X86-NEXT: [[U_I4_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 +// X86-NEXT: [[LARGESTRUCTTWOMEM3_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 +// X86-NEXT: [[U_I_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 +// X86-NEXT: [[LARGESTRUCTONEMEM2_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 +// X86-NEXT: [[LARGESTRUCTONEMEM_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 +// X86-NEXT: [[LARGESTRUCTTWOMEM_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 +// X86-NEXT: [[LARGESTRUCTTWOMEM2:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 +// X86-NEXT: [[LARGESTRUCTONEMEM1:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[LARGESTRUCTTWOMEM2]], ptr align 1 [[LARGESTRUCTTWOMEM]], i64 480, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[LARGESTRUCTONEMEM1]], ptr align 1 [[LARGESTRUCTONEMEM]], i64 800, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[LARGESTRUCTONEMEM_I]], ptr align 4 [[LARGESTRUCTONEMEM1]], i32 800, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[LARGESTRUCTTWOMEM_I]], ptr align 4 [[LARGESTRUCTTWOMEM2]], i32 480, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[LARGESTRUCTONEMEM2_I]], ptr align 1 [[LARGESTRUCTONEMEM_I]], i64 800, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I_I]], ptr align 4 [[LARGESTRUCTONEMEM2_I]], i32 800, i1 false) +// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U_I_I]]) #[[ATTR5]] +// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[LARGESTRUCTTWOMEM3_I]], ptr align 1 [[LARGESTRUCTTWOMEM_I]], i64 480, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I4_I]], ptr align 4 [[LARGESTRUCTTWOMEM3_I]], i32 480, i1 false) +// X86-NEXT: call void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U_I4_I]]) #[[ATTR5]] +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_KernelLargeOneMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[LARGESTRUCTONEMEM_I]]) #[[ATTR6]] +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_KernelLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[LARGESTRUCTTWOMEM_I]]) #[[ATTR6]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: alwaysinline convergent norecurse nounwind +// X86-LABEL: define void @__clang_ocl_kern_imp_caller_kern3( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]], ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 4 [[TMP1:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META26]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META27]] !kernel_arg_base_type [[META27]] !kernel_arg_type_qual [[META11]] { +// X86-NEXT: entry: +// X86-NEXT: [[U_I4:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 +// X86-NEXT: [[LARGESTRUCTTWOMEM3:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 +// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 +// X86-NEXT: [[LARGESTRUCTONEMEM2:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 +// X86-NEXT: [[LARGESTRUCTONEMEM:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 +// X86-NEXT: [[LARGESTRUCTTWOMEM:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[LARGESTRUCTONEMEM]], ptr align 4 [[TMP0]], i32 800, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[LARGESTRUCTTWOMEM]], ptr align 4 [[TMP1]], i32 480, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[LARGESTRUCTONEMEM2]], ptr align 1 [[LARGESTRUCTONEMEM]], i64 800, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[LARGESTRUCTONEMEM2]], i32 800, i1 false) +// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U_I]]) #[[ATTR5]] +// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[LARGESTRUCTTWOMEM3]], ptr align 1 [[LARGESTRUCTTWOMEM]], i64 480, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I4]], ptr align 4 [[LARGESTRUCTTWOMEM3]], i32 480, i1 false) +// X86-NEXT: call void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U_I4]]) #[[ATTR5]] +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_KernelLargeOneMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[LARGESTRUCTONEMEM]]) #[[ATTR6]] +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_KernelLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[LARGESTRUCTTWOMEM]]) #[[ATTR6]] +// X86-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local %struct.Mat4X4 @foo( +// AMDGCN-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0 +// AMDGCN-NEXT: store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4 +// AMDGCN-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4 +// AMDGCN-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]] +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local void @foo_large( +// AMDGCN-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false) +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local void @FuncOneMember( +// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 +// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8 +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local void @FuncOneLargeMember( +// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) +// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0 +// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local void @FuncTwoMember( +// AMDGCN-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8 +// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8 +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local void @FuncLargeTwoMember( +// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) +// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0 +// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @callee_kern( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[A:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[A_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[A_ADDR_I]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR_I]], align 8 +// AMDGCN-NEXT: store i32 1, ptr addrspace(1) [[TMP1]], align 4 +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: alwaysinline convergent norecurse nounwind +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_callee_kern( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[A:%.*]]) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 +// AMDGCN-NEXT: store i32 1, ptr addrspace(1) [[TMP0]], align 4 +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @callee_kern_Mat3X3( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META9:![0-9]+]] !kernel_arg_type [[META10:![0-9]+]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11:![0-9]+]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[IN_ADDR_I]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP1]], ptr addrspace(5) [[OUT_ADDR_I]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR_I]], align 8 +// AMDGCN-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR_I]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP3]], i64 1 +// AMDGCN-NEXT: [[TMP4:%.*]] = load [9 x i32], ptr addrspace(1) [[ARRAYIDX1_I]], align 4 +// AMDGCN-NEXT: [[CALL_I:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP4]]) #[[ATTR5:[0-9]+]] +// AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL_I]], 0 +// AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP_I]], align 4 +// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(5) align 4 [[TMP_I]], i64 64, i1 false) +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: alwaysinline convergent norecurse nounwind +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_callee_kern_Mat3X3( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META10]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1 +// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4 +// AMDGCN-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO]]([9 x i32] [[TMP3]]) #[[ATTR5]] +// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0 +// AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4 +// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false) +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @callee_kern_Mat32X32( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META11]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[IN_ADDR_I]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP1]], ptr addrspace(5) [[OUT_ADDR_I]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR_I]], align 8 +// AMDGCN-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR_I]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP3]], i64 1 +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i64 4096, i1 false) +// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR5]] +// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(5) align 4 [[TMP_I]], i64 16384, i1 false) +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: alwaysinline convergent norecurse nounwind +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_callee_kern_Mat32X32( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META11]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1 +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false) +// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR5]] +// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false) +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMember( +// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13:![0-9]+]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META14:![0-9]+]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 +// AMDGCN-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 +// AMDGCN-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[U_I]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[U_I]], align 8 +// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR5]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: alwaysinline convergent norecurse nounwind +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelOneMember( +// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META14]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 +// AMDGCN-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 +// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR5]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( +// AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 +// AMDGCN-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U_I]], ptr addrspace(5) align 8 [[U]], i64 800, i1 false) +// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U_I]]) #[[ATTR5]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: alwaysinline convergent norecurse nounwind +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeOneMember( +// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) +// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR5]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( +// AMDGCN-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 +// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1 +// AMDGCN-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 +// AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 +// AMDGCN-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(5) [[U_I]], align 8 +// AMDGCN-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_I]], i32 0, i32 1 +// AMDGCN-NEXT: store <2 x i32> [[TMP7]], ptr addrspace(5) [[TMP8]], align 8 +// AMDGCN-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr addrspace(5) [[U_I]], align 8 +// AMDGCN-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_I]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP11:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP10]], align 8 +// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP9]], <2 x i32> [[TMP11]]) #[[ATTR5]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: alwaysinline convergent norecurse nounwind +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelTwoMember( +// AMDGCN-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 +// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP3]], <2 x i32> [[TMP5]]) #[[ATTR5]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( +// AMDGCN-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 +// AMDGCN-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 +// AMDGCN-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U_I]], ptr addrspace(5) align 8 [[U]], i64 480, i1 false) +// AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U_I]]) #[[ATTR5]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: alwaysinline convergent norecurse nounwind +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeTwoMember( +// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) +// AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR5]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @caller_kern( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[A:%.*]], ptr addrspace(1) noundef align 4 [[MAT3X3:%.*]], ptr addrspace(1) noundef align 4 [[MAT4X4:%.*]], ptr addrspace(1) noundef align 4 [[MAT32X32:%.*]], ptr addrspace(1) noundef align 4 [[MAT64X64:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META18:![0-9]+]] !kernel_arg_access_qual [[META19:![0-9]+]] !kernel_arg_type [[META20:![0-9]+]] !kernel_arg_base_type [[META20]] !kernel_arg_type_qual [[META21:![0-9]+]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[IN_ADDR_I1_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR_I2_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP_I3_I:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[BYVAL_TEMP_I_I:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[IN_ADDR_I_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR_I_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP_I_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[A_ADDR_I_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[A_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[MAT3X3_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[MAT4X4_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[MAT32X32_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[MAT64X64_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[MAT3X3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[MAT4X4_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[MAT32X32_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[MAT64X64_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[MAT3X3]], ptr addrspace(5) [[MAT3X3_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[MAT4X4]], ptr addrspace(5) [[MAT4X4_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[MAT32X32]], ptr addrspace(5) [[MAT32X32_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[MAT64X64]], ptr addrspace(5) [[MAT64X64_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT3X3_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT4X4_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT32X32_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT64X64_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[A_ADDR_I]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP1]], ptr addrspace(5) [[MAT3X3_ADDR_I]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP2]], ptr addrspace(5) [[MAT4X4_ADDR_I]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP3]], ptr addrspace(5) [[MAT32X32_ADDR_I]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP4]], ptr addrspace(5) [[MAT64X64_ADDR_I]], align 8 +// AMDGCN-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR_I]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP5]], ptr addrspace(5) [[A_ADDR_I_I]], align 8 +// AMDGCN-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR_I_I]], align 8 +// AMDGCN-NEXT: store i32 1, ptr addrspace(1) [[TMP6]], align 4 +// AMDGCN-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR_I]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern(ptr addrspace(1) noundef align 4 [[TMP7]]) #[[ATTR6:[0-9]+]] +// AMDGCN-NEXT: [[TMP8:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT3X3_ADDR_I]], align 8 +// AMDGCN-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT4X4_ADDR_I]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP8]], ptr addrspace(5) [[IN_ADDR_I_I]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP9]], ptr addrspace(5) [[OUT_ADDR_I_I]], align 8 +// AMDGCN-NEXT: [[TMP10:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR_I_I]], align 8 +// AMDGCN-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR_I_I]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX1_I_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP11]], i64 1 +// AMDGCN-NEXT: [[TMP12:%.*]] = load [9 x i32], ptr addrspace(1) [[ARRAYIDX1_I_I]], align 4 +// AMDGCN-NEXT: [[CALL_I_I:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO]]([9 x i32] [[TMP12]]) #[[ATTR5]] +// AMDGCN-NEXT: [[TMP13:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL_I_I]], 0 +// AMDGCN-NEXT: store [16 x i32] [[TMP13]], ptr addrspace(5) [[TMP_I_I]], align 4 +// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP10]], ptr addrspace(5) align 4 [[TMP_I_I]], i64 64, i1 false) +// AMDGCN-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT32X32_ADDR_I]], align 8 +// AMDGCN-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT64X64_ADDR_I]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP14]], ptr addrspace(5) [[IN_ADDR_I1_I]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP15]], ptr addrspace(5) [[OUT_ADDR_I2_I]], align 8 +// AMDGCN-NEXT: [[TMP16:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR_I2_I]], align 8 +// AMDGCN-NEXT: [[TMP17:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR_I1_I]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX1_I4_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP17]], i64 1 +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP_I_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I4_I]], i64 4096, i1 false) +// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I3_I]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I_I]]) #[[ATTR5]] +// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP16]], ptr addrspace(5) align 4 [[TMP_I3_I]], i64 16384, i1 false) +// AMDGCN-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT3X3_ADDR_I]], align 8 +// AMDGCN-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT4X4_ADDR_I]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern_Mat3X3(ptr addrspace(1) noundef align 4 [[TMP18]], ptr addrspace(1) noundef align 4 [[TMP19]]) #[[ATTR6]] +// AMDGCN-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT32X32_ADDR_I]], align 8 +// AMDGCN-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT64X64_ADDR_I]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern_Mat32X32(ptr addrspace(1) noundef align 4 [[TMP20]], ptr addrspace(1) noundef align 4 [[TMP21]]) #[[ATTR6]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: alwaysinline convergent norecurse nounwind +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_caller_kern( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[A:%.*]], ptr addrspace(1) noundef align 4 [[MAT3X3:%.*]], ptr addrspace(1) noundef align 4 [[MAT4X4:%.*]], ptr addrspace(1) noundef align 4 [[MAT32X32:%.*]], ptr addrspace(1) noundef align 4 [[MAT64X64:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META18]] !kernel_arg_access_qual [[META19]] !kernel_arg_type [[META20]] !kernel_arg_base_type [[META20]] !kernel_arg_type_qual [[META21]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[IN_ADDR_I1:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR_I2:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP_I3:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[A_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[MAT3X3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[MAT4X4_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[MAT32X32_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[MAT64X64_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[MAT3X3]], ptr addrspace(5) [[MAT3X3_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[MAT4X4]], ptr addrspace(5) [[MAT4X4_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[MAT32X32]], ptr addrspace(5) [[MAT32X32_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[MAT64X64]], ptr addrspace(5) [[MAT64X64_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[A_ADDR_I]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR_I]], align 8 +// AMDGCN-NEXT: store i32 1, ptr addrspace(1) [[TMP1]], align 4 +// AMDGCN-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern(ptr addrspace(1) noundef align 4 [[TMP2]]) #[[ATTR6]] +// AMDGCN-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT3X3_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT4X4_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP3]], ptr addrspace(5) [[IN_ADDR_I]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP4]], ptr addrspace(5) [[OUT_ADDR_I]], align 8 +// AMDGCN-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR_I]], align 8 +// AMDGCN-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR_I]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP6]], i64 1 +// AMDGCN-NEXT: [[TMP7:%.*]] = load [9 x i32], ptr addrspace(1) [[ARRAYIDX1_I]], align 4 +// AMDGCN-NEXT: [[CALL_I:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO]]([9 x i32] [[TMP7]]) #[[ATTR5]] +// AMDGCN-NEXT: [[TMP8:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL_I]], 0 +// AMDGCN-NEXT: store [16 x i32] [[TMP8]], ptr addrspace(5) [[TMP_I]], align 4 +// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP5]], ptr addrspace(5) align 4 [[TMP_I]], i64 64, i1 false) +// AMDGCN-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT32X32_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP10:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT64X64_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP9]], ptr addrspace(5) [[IN_ADDR_I1]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP10]], ptr addrspace(5) [[OUT_ADDR_I2]], align 8 +// AMDGCN-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR_I2]], align 8 +// AMDGCN-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR_I1]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX1_I4:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP12]], i64 1 +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I4]], i64 4096, i1 false) +// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I3]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR5]] +// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP11]], ptr addrspace(5) align 4 [[TMP_I3]], i64 16384, i1 false) +// AMDGCN-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT3X3_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT4X4_ADDR]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern_Mat3X3(ptr addrspace(1) noundef align 4 [[TMP13]], ptr addrspace(1) noundef align 4 [[TMP14]]) #[[ATTR6]] +// AMDGCN-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT32X32_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP16:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT64X64_ADDR]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern_Mat32X32(ptr addrspace(1) noundef align 4 [[TMP15]], ptr addrspace(1) noundef align 4 [[TMP16]]) #[[ATTR6]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @caller_kern2( +// AMDGCN-SAME: <2 x i32> [[STRUCTONEMEM_COERCE:%.*]], ptr addrspace(1) noundef align 8 [[GLOBAL_STRUCTONEMEM:%.*]], [[STRUCT_STRUCTTWOMEMBER:%.*]] [[STRUCTTWOMEM_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META22:![0-9]+]] !kernel_arg_access_qual [[META23:![0-9]+]] !kernel_arg_type [[META24:![0-9]+]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25:![0-9]+]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U_I3_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[U_I_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[STRUCTONEMEM_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[STRUCTTWOMEM_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[GLOBAL_STRUCTONEMEM_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[STRUCTONEMEM:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[STRUCTTWOMEM:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[GLOBAL_STRUCTONEMEM_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[STRUCTONEMEM]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[STRUCTONEMEM_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[STRUCTTWOMEM_COERCE]], 0 +// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[STRUCTTWOMEM_COERCE]], 1 +// AMDGCN-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[GLOBAL_STRUCTONEMEM]], ptr addrspace(5) [[GLOBAL_STRUCTONEMEM_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[GLOBAL_STRUCTONEMEM_ADDR]], align 8 +// AMDGCN-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[STRUCTONEMEM]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 +// AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 +// AMDGCN-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP8]], align 8 +// AMDGCN-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(5) [[STRUCTONEMEM_I]], align 8 +// AMDGCN-NEXT: store <2 x i32> [[TMP7]], ptr addrspace(5) [[STRUCTTWOMEM_I]], align 8 +// AMDGCN-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM_I]], i32 0, i32 1 +// AMDGCN-NEXT: store <2 x i32> [[TMP9]], ptr addrspace(5) [[TMP10]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[TMP4]], ptr addrspace(5) [[GLOBAL_STRUCTONEMEM_ADDR_I]], align 8 +// AMDGCN-NEXT: [[TMP11:%.*]] = load <2 x i32>, ptr addrspace(5) [[STRUCTONEMEM_I]], align 8 +// AMDGCN-NEXT: store <2 x i32> [[TMP11]], ptr addrspace(5) [[U_I_I]], align 8 +// AMDGCN-NEXT: [[TMP12:%.*]] = load <2 x i32>, ptr addrspace(5) [[U_I_I]], align 8 +// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP12]]) #[[ATTR5]] +// AMDGCN-NEXT: [[TMP13:%.*]] = load <2 x i32>, ptr addrspace(5) [[STRUCTONEMEM_I]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_KernelOneMember(<2 x i32> [[TMP13]]) #[[ATTR6]] +// AMDGCN-NEXT: [[TMP14:%.*]] = load <2 x i32>, ptr addrspace(5) [[STRUCTTWOMEM_I]], align 8 +// AMDGCN-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM_I]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP16:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP15]], align 8 +// AMDGCN-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(5) [[U_I3_I]], align 8 +// AMDGCN-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_I3_I]], i32 0, i32 1 +// AMDGCN-NEXT: store <2 x i32> [[TMP16]], ptr addrspace(5) [[TMP17]], align 8 +// AMDGCN-NEXT: [[TMP18:%.*]] = load <2 x i32>, ptr addrspace(5) [[U_I3_I]], align 8 +// AMDGCN-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_I3_I]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP20:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP19]], align 8 +// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP18]], <2 x i32> [[TMP20]]) #[[ATTR5]] +// AMDGCN-NEXT: [[TMP21:%.*]] = load <2 x i32>, ptr addrspace(5) [[STRUCTTWOMEM_I]], align 8 +// AMDGCN-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM_I]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP23:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP22]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_KernelTwoMember(<2 x i32> [[TMP21]], <2 x i32> [[TMP23]]) #[[ATTR6]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: alwaysinline convergent norecurse nounwind +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_caller_kern2( +// AMDGCN-SAME: <2 x i32> [[STRUCTONEMEM_COERCE:%.*]], ptr addrspace(1) noundef align 8 [[GLOBAL_STRUCTONEMEM:%.*]], <2 x i32> [[STRUCTTWOMEM_COERCE0:%.*]], <2 x i32> [[STRUCTTWOMEM_COERCE1:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META22]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U_I3:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[STRUCTONEMEM:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[STRUCTTWOMEM:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[GLOBAL_STRUCTONEMEM_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[STRUCTONEMEM]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[STRUCTONEMEM_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[STRUCTTWOMEM_COERCE0]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1 +// AMDGCN-NEXT: store <2 x i32> [[STRUCTTWOMEM_COERCE1]], ptr addrspace(5) [[TMP1]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[GLOBAL_STRUCTONEMEM]], ptr addrspace(5) [[GLOBAL_STRUCTONEMEM_ADDR]], align 8 +// AMDGCN-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[STRUCTONEMEM]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 +// AMDGCN-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[U_I]], align 8 +// AMDGCN-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr addrspace(5) [[U_I]], align 8 +// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP3]]) #[[ATTR5]] +// AMDGCN-NEXT: [[COERCE_DIVE2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[STRUCTONEMEM]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE2]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_KernelOneMember(<2 x i32> [[TMP4]]) #[[ATTR6]] +// AMDGCN-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP5]], align 8 +// AMDGCN-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP7]], align 8 +// AMDGCN-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(5) [[U_I3]], align 8 +// AMDGCN-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_I3]], i32 0, i32 1 +// AMDGCN-NEXT: store <2 x i32> [[TMP8]], ptr addrspace(5) [[TMP9]], align 8 +// AMDGCN-NEXT: [[TMP10:%.*]] = load <2 x i32>, ptr addrspace(5) [[U_I3]], align 8 +// AMDGCN-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_I3]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP12:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP11]], align 8 +// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP10]], <2 x i32> [[TMP12]]) #[[ATTR5]] +// AMDGCN-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP14:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP13]], align 8 +// AMDGCN-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP16:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP15]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_KernelTwoMember(<2 x i32> [[TMP14]], <2 x i32> [[TMP16]]) #[[ATTR6]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @caller_kern3( +// AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[LARGESTRUCTONEMEM_COERCE:%.*]], [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[LARGESTRUCTTWOMEM_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META26:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META27:![0-9]+]] !kernel_arg_base_type [[META27]] !kernel_arg_type_qual [[META11]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U_I2_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[U_I_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[LARGESTRUCTONEMEM_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[LARGESTRUCTTWOMEM_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[LARGESTRUCTONEMEM:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[LARGESTRUCTTWOMEM:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[LARGESTRUCTONEMEM]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[LARGESTRUCTONEMEM_COERCE]], 0 +// AMDGCN-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[LARGESTRUCTTWOMEM]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[LARGESTRUCTTWOMEM_COERCE]], 0 +// AMDGCN-NEXT: store [40 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[LARGESTRUCTTWOMEM]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[LARGESTRUCTTWOMEM_COERCE]], 1 +// AMDGCN-NEXT: store [20 x <2 x i32>] [[TMP5]], ptr addrspace(5) [[TMP4]], align 8 +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[LARGESTRUCTONEMEM_I]], ptr addrspace(5) align 8 [[LARGESTRUCTONEMEM]], i64 800, i1 false) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[LARGESTRUCTTWOMEM_I]], ptr addrspace(5) align 8 [[LARGESTRUCTTWOMEM]], i64 480, i1 false) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U_I_I]], ptr addrspace(5) align 8 [[LARGESTRUCTONEMEM_I]], i64 800, i1 false) +// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U_I_I]]) #[[ATTR5]] +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U_I2_I]], ptr addrspace(5) align 8 [[LARGESTRUCTTWOMEM_I]], i64 480, i1 false) +// AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U_I2_I]]) #[[ATTR5]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[LARGESTRUCTONEMEM_I]]) #[[ATTR6]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[LARGESTRUCTTWOMEM_I]]) #[[ATTR6]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: alwaysinline convergent norecurse nounwind +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_caller_kern3( +// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]], ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP1:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META26]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META27]] !kernel_arg_base_type [[META27]] !kernel_arg_type_qual [[META11]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U_I2:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[LARGESTRUCTONEMEM:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[LARGESTRUCTTWOMEM:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[LARGESTRUCTONEMEM]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[LARGESTRUCTTWOMEM]], ptr addrspace(5) align 8 [[TMP1]], i64 480, i1 false) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U_I]], ptr addrspace(5) align 8 [[LARGESTRUCTONEMEM]], i64 800, i1 false) +// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U_I]]) #[[ATTR5]] +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U_I2]], ptr addrspace(5) align 8 [[LARGESTRUCTTWOMEM]], i64 480, i1 false) +// AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U_I2]]) #[[ATTR5]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[LARGESTRUCTONEMEM]]) #[[ATTR6]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[LARGESTRUCTTWOMEM]]) #[[ATTR6]] +// AMDGCN-NEXT: ret void +// +//. +// X86: [[META4]] = !{i32 1} +// X86: [[META5]] = !{!"none"} +// X86: [[META6]] = !{!"int*"} +// X86: [[META7]] = !{!""} +// X86: [[META8]] = !{i32 1, i32 1} +// X86: [[META9]] = !{!"none", !"none"} +// X86: [[META10]] = !{!"Mat3X3*", !"Mat4X4*"} +// X86: [[META11]] = !{!"", !""} +// X86: [[META12]] = !{!"Mat32X32*", !"Mat64X64*"} +// X86: [[META13]] = !{i32 0} +// X86: [[META14]] = !{!"struct StructOneMember"} +// X86: [[META15]] = !{!"struct LargeStructOneMember"} +// X86: [[META16]] = !{!"struct StructTwoMember"} +// X86: [[META17]] = !{!"struct LargeStructTwoMember"} +// X86: [[META18]] = !{i32 1, i32 1, i32 1, i32 1, i32 1} +// X86: [[META19]] = !{!"none", !"none", !"none", !"none", !"none"} +// X86: [[META20]] = !{!"int*", !"Mat3X3*", !"Mat4X4*", !"Mat32X32*", !"Mat64X64*"} +// X86: [[META21]] = !{!"", !"", !"", !"", !""} +// X86: [[META22]] = !{i32 0, i32 1, i32 0} +// X86: [[META23]] = !{!"none", !"none", !"none"} +// X86: [[META24]] = !{!"struct StructOneMember", !"struct StructOneMember*", !"struct StructTwoMember"} +// X86: [[META25]] = !{!"", !"", !""} +// X86: [[META26]] = !{i32 0, i32 0} +// X86: [[META27]] = !{!"struct LargeStructOneMember", !"struct LargeStructTwoMember"} //. -// CHECK: [[META4]] = !{i32 1} -// CHECK: [[META5]] = !{!"none"} -// CHECK: [[META6]] = !{!"int*"} -// CHECK: [[META7]] = !{!""} +// AMDGCN: [[META4]] = !{i32 1} +// AMDGCN: [[META5]] = !{!"none"} +// AMDGCN: [[META6]] = !{!"int*"} +// AMDGCN: [[META7]] = !{!""} +// AMDGCN: [[META8]] = !{i32 1, i32 1} +// AMDGCN: [[META9]] = !{!"none", !"none"} +// AMDGCN: [[META10]] = !{!"Mat3X3*", !"Mat4X4*"} +// AMDGCN: [[META11]] = !{!"", !""} +// AMDGCN: [[META12]] = !{!"Mat32X32*", !"Mat64X64*"} +// AMDGCN: [[META13]] = !{i32 0} +// AMDGCN: [[META14]] = !{!"struct StructOneMember"} +// AMDGCN: [[META15]] = !{!"struct LargeStructOneMember"} +// AMDGCN: [[META16]] = !{!"struct StructTwoMember"} +// AMDGCN: [[META17]] = !{!"struct LargeStructTwoMember"} +// AMDGCN: [[META18]] = !{i32 1, i32 1, i32 1, i32 1, i32 1} +// AMDGCN: [[META19]] = !{!"none", !"none", !"none", !"none", !"none"} +// AMDGCN: [[META20]] = !{!"int*", !"Mat3X3*", !"Mat4X4*", !"Mat32X32*", !"Mat64X64*"} +// AMDGCN: [[META21]] = !{!"", !"", !"", !"", !""} +// AMDGCN: [[META22]] = !{i32 0, i32 1, i32 0} +// AMDGCN: [[META23]] = !{!"none", !"none", !"none"} +// AMDGCN: [[META24]] = !{!"struct StructOneMember", !"struct StructOneMember*", !"struct StructTwoMember"} +// AMDGCN: [[META25]] = !{!"", !"", !""} +// AMDGCN: [[META26]] = !{i32 0, i32 0} +// AMDGCN: [[META27]] = !{!"struct LargeStructOneMember", !"struct LargeStructTwoMember"} //. From 799178f14528b21bb3420a550c5ac156e56e32f0 Mon Sep 17 00:00:00 2001 From: anikelal Date: Wed, 12 Mar 2025 17:46:44 +0530 Subject: [PATCH 5/8] Update tests --- .../CodeGenOpenCL/addr-space-struct-arg.cl | 12 +- .../amdgpu-abi-struct-arg-byref.cl | 12 +- .../CodeGenOpenCL/amdgpu-enqueue-kernel.cl | 134 ++++++++++-------- ...plicit-addrspacecast-function-parameter.cl | 71 ++++++---- 4 files changed, 123 insertions(+), 106 deletions(-) diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl index a92648e01db12..79befe7ef86a7 100644 --- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl +++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl @@ -168,7 +168,6 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { // AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // AMDGCN20-NEXT: [[IN_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR_I]] to ptr // AMDGCN20-NEXT: [[OUT_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR_I]] to ptr -// AMDGCN20-NEXT: [[TMP_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[TMP_I]] to ptr // AMDGCN20-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_ASCAST_I]], align 8 // AMDGCN20-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_ASCAST_I]], align 8 // AMDGCN20-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST_I]], align 8 @@ -177,8 +176,8 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { // AMDGCN20-NEXT: [[TMP4:%.*]] = load [9 x i32], ptr addrspace(1) [[ARRAYIDX1_I]], align 4 // AMDGCN20-NEXT: [[CALL_I:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP4]]) #[[ATTR4:[0-9]+]] // AMDGCN20-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL_I]], 0 -// AMDGCN20-NEXT: store [16 x i32] [[TMP5]], ptr [[TMP_ASCAST_I]], align 4 -// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[TMP2]], ptr align 4 [[TMP_ASCAST_I]], i64 64, i1 false) +// AMDGCN20-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP_I]], align 4 +// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(5) align 4 [[TMP_I]], i64 64, i1 false) // AMDGCN20-NEXT: ret void // // SPIR-LABEL: define dso_local spir_kernel void @ker( @@ -271,7 +270,7 @@ kernel void ker(global Mat3X3 *in, global Mat4X4 *out) { // AMDGCN-NEXT: ret void // // AMDGCN20-LABEL: define dso_local void @foo_large( -// AMDGCN20-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN20-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { // AMDGCN20-NEXT: [[ENTRY:.*:]] // AMDGCN20-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) // AMDGCN20-NEXT: [[IN:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr @@ -365,15 +364,14 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { // AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // AMDGCN20-NEXT: [[IN_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR_I]] to ptr // AMDGCN20-NEXT: [[OUT_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR_I]] to ptr -// AMDGCN20-NEXT: [[TMP_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[TMP_I]] to ptr // AMDGCN20-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_ASCAST_I]], align 8 // AMDGCN20-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_ASCAST_I]], align 8 // AMDGCN20-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST_I]], align 8 // AMDGCN20-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST_I]], align 8 // AMDGCN20-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP3]], i64 1 // AMDGCN20-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i64 4096, i1 false) -// AMDGCN20-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_ASCAST_I]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]] -// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[TMP2]], ptr align 4 [[TMP_ASCAST_I]], i64 16384, i1 false) +// AMDGCN20-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]] +// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(5) align 4 [[TMP_I]], i64 16384, i1 false) // AMDGCN20-NEXT: ret void // // SPIR-LABEL: define dso_local spir_kernel void @ker_large( diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl index e25e060ea3ea9..3331559e3e05b 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl @@ -78,7 +78,6 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { // AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // AMDGCN-NEXT: [[IN_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR_I]] to ptr // AMDGCN-NEXT: [[OUT_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR_I]] to ptr -// AMDGCN-NEXT: [[TMP_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[TMP_I]] to ptr // AMDGCN-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_ASCAST_I]], align 8 // AMDGCN-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_ASCAST_I]], align 8 // AMDGCN-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST_I]], align 8 @@ -87,8 +86,8 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { // AMDGCN-NEXT: [[TMP4:%.*]] = load [9 x i32], ptr addrspace(1) [[ARRAYIDX1_I]], align 4 // AMDGCN-NEXT: [[CALL_I:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP4]]) #[[ATTR4:[0-9]+]] // AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL_I]], 0 -// AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr [[TMP_ASCAST_I]], align 4 -// AMDGCN-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[TMP2]], ptr align 4 [[TMP_ASCAST_I]], i64 64, i1 false) +// AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP_I]], align 4 +// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(5) align 4 [[TMP_I]], i64 64, i1 false) // AMDGCN-NEXT: ret void // kernel void ker(global Mat3X3 *in, global Mat4X4 *out) { @@ -96,7 +95,7 @@ kernel void ker(global Mat3X3 *in, global Mat4X4 *out) { } // AMDGCN-LABEL: define dso_local void @foo_large( -// AMDGCN-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { // AMDGCN-NEXT: [[ENTRY:.*:]] // AMDGCN-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) // AMDGCN-NEXT: [[IN:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr @@ -125,15 +124,14 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { // AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // AMDGCN-NEXT: [[IN_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR_I]] to ptr // AMDGCN-NEXT: [[OUT_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR_I]] to ptr -// AMDGCN-NEXT: [[TMP_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[TMP_I]] to ptr // AMDGCN-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_ASCAST_I]], align 8 // AMDGCN-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_ASCAST_I]], align 8 // AMDGCN-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST_I]], align 8 // AMDGCN-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST_I]], align 8 // AMDGCN-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP3]], i64 1 // AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i64 4096, i1 false) -// AMDGCN-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_ASCAST_I]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]] -// AMDGCN-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[TMP2]], ptr align 4 [[TMP_ASCAST_I]], i64 16384, i1 false) +// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]] +// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(5) align 4 [[TMP_I]], i64 16384, i1 false) // AMDGCN-NEXT: ret void // kernel void ker_large(global Mat32X32 *in, global Mat64X64 *out) { diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl index 10ecfc0a1e6ba..1f62c6cce9a3c 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl @@ -61,6 +61,10 @@ kernel void test_target_features_kernel(global int *i) { } //. +// CHECK: @__test_block_invoke_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle" +// CHECK: @__test_block_invoke_2_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t.0 zeroinitializer, section ".amdgpu.kernel.runtime.handle" +// CHECK: @__test_block_invoke_3_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t.1 zeroinitializer, section ".amdgpu.kernel.runtime.handle" +// CHECK: @__test_block_invoke_4_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t.2 zeroinitializer, section ".amdgpu.kernel.runtime.handle" // CHECK: @__block_literal_global = internal addrspace(1) constant { i32, i32, ptr } { i32 16, i32 8, ptr @__test_target_features_kernel_block_invoke }, align 8 #0 // CHECK: @__test_target_features_kernel_block_invoke_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t.3 zeroinitializer, section ".amdgpu.kernel.runtime.handle" // CHECK: @llvm.used = appending addrspace(1) global [10 x ptr] [ptr @__test_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle to ptr), ptr @__test_block_invoke_2_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle to ptr), ptr @__test_block_invoke_3_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr @__test_block_invoke_4_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to ptr), ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr)], section "llvm.metadata" @@ -165,7 +169,7 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU-NEXT: [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 4 // NOCPU-NEXT: [[TMP3:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1 // NOCPU-NEXT: store i8 [[TMP3]], ptr [[BLOCK_CAPTURED1]], align 8 -// NOCPU-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr [[TMP_ASCAST]], ptr @__test_block_invoke_kernel, ptr [[BLOCK_ASCAST]]) +// NOCPU-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle to ptr), ptr [[BLOCK_ASCAST]]) // NOCPU-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8 // NOCPU-NEXT: [[TMP6:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4 // NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP2_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false) @@ -187,7 +191,7 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU-NEXT: [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 5 // NOCPU-NEXT: [[TMP10:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8 // NOCPU-NEXT: store i64 [[TMP10]], ptr [[BLOCK_CAPTURED10]], align 8 -// NOCPU-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr [[TMP2_ASCAST]], ptr @__test_block_invoke_2_kernel, ptr [[BLOCK3_ASCAST]]) +// NOCPU-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr addrspace(5) [[VARTMP2]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle to ptr), ptr [[BLOCK3_ASCAST]]) // NOCPU-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8 // NOCPU-NEXT: [[TMP13:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4 // NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP11_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false) @@ -211,7 +215,7 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU-NEXT: store i64 [[TMP17]], ptr [[BLOCK_CAPTURED19]], align 8 // NOCPU-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr [[BLOCK_SIZES_ASCAST]], i32 0, i32 0 // NOCPU-NEXT: store i64 100, ptr [[TMP18]], align 8 -// NOCPU-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr [[TMP11_ASCAST]], ptr @__test_block_invoke_3_kernel, ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]]) +// NOCPU-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]]) // NOCPU-NEXT: [[BLOCK_SIZE22:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 0 // NOCPU-NEXT: store i32 32, ptr [[BLOCK_SIZE22]], align 8 // NOCPU-NEXT: [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 1 @@ -229,7 +233,7 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU-NEXT: [[TMP23:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4 // NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP27_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false) // NOCPU-NEXT: [[TMP24:%.*]] = load ptr, ptr [[BLOCK20_ASCAST]], align 8 -// NOCPU-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr [[TMP27_ASCAST]], ptr @__test_block_invoke_4_kernel, ptr [[BLOCK21_ASCAST]]) +// NOCPU-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr addrspace(5) [[VARTMP27]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to ptr), ptr [[BLOCK21_ASCAST]]) // NOCPU-NEXT: ret void // // @@ -265,7 +269,7 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8 // NOCPU-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4 // NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false) -// NOCPU-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr [[TMP_ASCAST]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr)) +// NOCPU-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr), ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr)) // NOCPU-NEXT: ret void // // @@ -290,7 +294,7 @@ kernel void test_target_features_kernel(global int *i) { // // NOCPU: Function Attrs: convergent nounwind // NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_kernel -// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR8:[0-9]+]] !kernel_arg_addr_space [[META11:![0-9]+]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META10]] { +// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR8:[0-9]+]] !associated [[META11:![0-9]+]] !kernel_arg_addr_space [[META12:![0-9]+]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META13]] !kernel_arg_type_qual [[META10]] { // NOCPU-NEXT: entry: // NOCPU-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), i8 }>, align 8, addrspace(5) // NOCPU-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -326,7 +330,7 @@ kernel void test_target_features_kernel(global int *i) { // // NOCPU: Function Attrs: convergent nounwind // NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_2_kernel -// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR8]] !kernel_arg_addr_space [[META11]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META10]] { +// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR8]] !associated [[META14:![0-9]+]] !kernel_arg_addr_space [[META12]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META13]] !kernel_arg_base_type [[META13]] !kernel_arg_type_qual [[META10]] { // NOCPU-NEXT: entry: // NOCPU-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) // NOCPU-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -368,7 +372,7 @@ kernel void test_target_features_kernel(global int *i) { // // NOCPU: Function Attrs: convergent nounwind // NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_3_kernel -// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR8]] !kernel_arg_addr_space [[META13:![0-9]+]] !kernel_arg_access_qual [[META14:![0-9]+]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META16:![0-9]+]] { +// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR8]] !associated [[META15:![0-9]+]] !kernel_arg_addr_space [[META16:![0-9]+]] !kernel_arg_access_qual [[META17:![0-9]+]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META19:![0-9]+]] { // NOCPU-NEXT: entry: // NOCPU-NEXT: [[TMP2:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) // NOCPU-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP2]], align 8 @@ -397,7 +401,7 @@ kernel void test_target_features_kernel(global int *i) { // // NOCPU: Function Attrs: convergent nounwind // NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_4_kernel -// NOCPU-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR8]] !kernel_arg_addr_space [[META11]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META10]] { +// NOCPU-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR8]] !associated [[META20:![0-9]+]] !kernel_arg_addr_space [[META12]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META13]] !kernel_arg_base_type [[META13]] !kernel_arg_type_qual [[META10]] { // NOCPU-NEXT: entry: // NOCPU-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5) // NOCPU-NEXT: store <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -422,7 +426,7 @@ kernel void test_target_features_kernel(global int *i) { // // NOCPU: Function Attrs: convergent nounwind // NOCPU-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke_kernel -// NOCPU-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR8]] !kernel_arg_addr_space [[META11]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META10]] { +// NOCPU-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR8]] !associated [[META21:![0-9]+]] !kernel_arg_addr_space [[META12]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META13]] !kernel_arg_base_type [[META13]] !kernel_arg_type_qual [[META10]] { // NOCPU-NEXT: entry: // NOCPU-NEXT: [[TMP1:%.*]] = alloca { i32, i32, ptr }, align 8, addrspace(5) // NOCPU-NEXT: store { i32, i32, ptr } [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -484,7 +488,7 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[TMP1:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16]] // GFX900-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[TBAA7]] // GFX900-NEXT: [[TMP3:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] -// GFX900-NEXT: call void @__clang_ocl_kern_imp_test(ptr addrspace(1) noundef align 1 [[TMP0]], i8 noundef signext [[TMP1]], ptr addrspace(1) noundef align 8 [[TMP2]], i64 noundef [[TMP3]]) #[[ATTR9:[0-9]+]] +// GFX900-NEXT: call void @__clang_ocl_kern_imp_test(ptr addrspace(1) noundef align 1 [[TMP0]], i8 noundef signext [[TMP1]], ptr addrspace(1) noundef align 8 [[TMP2]], i64 noundef [[TMP3]]) #[[ATTR8:[0-9]+]] // GFX900-NEXT: ret void // // @@ -530,10 +534,10 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: store i8 [[B]], ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16]] // GFX900-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[TBAA7]] // GFX900-NEXT: store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR10:[0-9]+]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR10]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR9:[0-9]+]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR9]] // GFX900-NEXT: store i32 0, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17:![0-9]+]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR10]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR9]] // GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA19:![0-9]+]] // GFX900-NEXT: [[TMP1:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]] // GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21:![0-9]+]] @@ -549,7 +553,7 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 4 // GFX900-NEXT: [[TMP3:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16]] // GFX900-NEXT: store i8 [[TMP3]], ptr [[BLOCK_CAPTURED1]], align 8, !tbaa [[TBAA16]] -// GFX900-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr [[TMP_ASCAST]], ptr @__test_block_invoke_kernel, ptr [[BLOCK_ASCAST]]) +// GFX900-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle to ptr), ptr [[BLOCK_ASCAST]]) // GFX900-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA19]] // GFX900-NEXT: [[TMP6:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]] // GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP2_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] @@ -571,7 +575,7 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 5 // GFX900-NEXT: [[TMP10:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] // GFX900-NEXT: store i64 [[TMP10]], ptr [[BLOCK_CAPTURED10]], align 8, !tbaa [[TBAA3]] -// GFX900-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr [[TMP2_ASCAST]], ptr @__test_block_invoke_2_kernel, ptr [[BLOCK3_ASCAST]]) +// GFX900-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr addrspace(5) [[VARTMP2]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle to ptr), ptr [[BLOCK3_ASCAST]]) // GFX900-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA19]] // GFX900-NEXT: [[TMP13:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]] // GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP11_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] @@ -593,12 +597,12 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[BLOCK_CAPTURED19:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 5 // GFX900-NEXT: [[TMP17:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] // GFX900-NEXT: store i64 [[TMP17]], ptr [[BLOCK_CAPTURED19]], align 8, !tbaa [[TBAA3]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR10]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR9]] // GFX900-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr [[BLOCK_SIZES_ASCAST]], i32 0, i32 0 // GFX900-NEXT: store i64 100, ptr [[TMP18]], align 8 -// GFX900-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr [[TMP11_ASCAST]], ptr @__test_block_invoke_3_kernel, ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]]) -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR10]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR10]] +// GFX900-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]]) +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR9]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR9]] // GFX900-NEXT: [[BLOCK_SIZE22:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 0 // GFX900-NEXT: store i32 32, ptr [[BLOCK_SIZE22]], align 8 // GFX900-NEXT: [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 1 @@ -616,11 +620,11 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[TMP23:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]] // GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP27_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] // GFX900-NEXT: [[TMP24:%.*]] = load ptr, ptr [[BLOCK20_ASCAST]], align 8, !tbaa [[TBAA16]] -// GFX900-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr [[TMP27_ASCAST]], ptr @__test_block_invoke_4_kernel, ptr [[BLOCK21_ASCAST]]) -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR10]] -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR10]] -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR10]] -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR10]] +// GFX900-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr addrspace(5) [[VARTMP27]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to ptr), ptr [[BLOCK21_ASCAST]]) +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR9]] +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR9]] +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR9]] +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR9]] // GFX900-NEXT: ret void // // @@ -632,7 +636,7 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr // GFX900-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[TBAA26:![0-9]+]] // GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[TBAA26]] -// GFX900-NEXT: call void @__clang_ocl_kern_imp_test_target_features_kernel(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR9]] +// GFX900-NEXT: call void @__clang_ocl_kern_imp_test_target_features_kernel(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR8]] // GFX900-NEXT: ret void // // @@ -651,18 +655,18 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[NDRANGE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[NDRANGE]] to ptr // GFX900-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr // GFX900-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[TBAA26]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR10]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR10]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR9]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR9]] // GFX900-NEXT: store i32 0, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR10]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR9]] // GFX900-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime() // GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA19]] // GFX900-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]] // GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] -// GFX900-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr [[TMP_ASCAST]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr)) -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR10]] -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR10]] -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR10]] +// GFX900-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr), ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr)) +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR9]] +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR9]] +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR9]] // GFX900-NEXT: ret void // // @@ -684,7 +688,7 @@ kernel void test_target_features_kernel(global int *i) { // // GFX900: Function Attrs: convergent nounwind // GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_kernel -// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR7:[0-9]+]] !kernel_arg_addr_space [[META28:![0-9]+]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META29:![0-9]+]] !kernel_arg_base_type [[META29]] !kernel_arg_type_qual [[META25]] { +// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR6]] !associated [[META28:![0-9]+]] !kernel_arg_addr_space [[META29:![0-9]+]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META30:![0-9]+]] !kernel_arg_base_type [[META30]] !kernel_arg_type_qual [[META25]] { // GFX900-NEXT: entry: // GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), i8 }>, align 8, addrspace(5) // GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -717,7 +721,7 @@ kernel void test_target_features_kernel(global int *i) { // // GFX900: Function Attrs: convergent nounwind // GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_2_kernel -// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR7]] !kernel_arg_addr_space [[META28]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META29]] !kernel_arg_base_type [[META29]] !kernel_arg_type_qual [[META25]] { +// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR6]] !associated [[META31:![0-9]+]] !kernel_arg_addr_space [[META29]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META30]] !kernel_arg_base_type [[META30]] !kernel_arg_type_qual [[META25]] { // GFX900-NEXT: entry: // GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) // GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -735,7 +739,7 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr // GFX900-NEXT: [[LP_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LP_ADDR]] to ptr // GFX900-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8 -// GFX900-NEXT: store ptr addrspace(3) [[LP]], ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[TBAA30:![0-9]+]] +// GFX900-NEXT: store ptr addrspace(3) [[LP]], ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[TBAA32:![0-9]+]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6 // GFX900-NEXT: [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA16]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3 @@ -748,7 +752,7 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR3]], align 8, !tbaa [[TBAA7]] // GFX900-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP3]], i64 0 // GFX900-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8, !tbaa [[TBAA3]] -// GFX900-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[TBAA30]] +// GFX900-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[TBAA32]] // GFX900-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[TMP4]], i64 0 // GFX900-NEXT: store i32 1, ptr addrspace(3) [[ARRAYIDX5]], align 4, !tbaa [[TBAA17]] // GFX900-NEXT: ret void @@ -756,7 +760,7 @@ kernel void test_target_features_kernel(global int *i) { // // GFX900: Function Attrs: convergent nounwind // GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_3_kernel -// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR7]] !kernel_arg_addr_space [[META32:![0-9]+]] !kernel_arg_access_qual [[META33:![0-9]+]] !kernel_arg_type [[META34:![0-9]+]] !kernel_arg_base_type [[META34]] !kernel_arg_type_qual [[META35:![0-9]+]] { +// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR6]] !associated [[META33:![0-9]+]] !kernel_arg_addr_space [[META34:![0-9]+]] !kernel_arg_access_qual [[META35:![0-9]+]] !kernel_arg_type [[META36:![0-9]+]] !kernel_arg_base_type [[META36]] !kernel_arg_type_qual [[META37:![0-9]+]] { // GFX900-NEXT: entry: // GFX900-NEXT: [[TMP2:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) // GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP2]], align 8 @@ -776,13 +780,13 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[TMP0:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA3]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4 // GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[TBAA7]] -// GFX900-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR11:[0-9]+]] +// GFX900-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR10:[0-9]+]] // GFX900-NEXT: ret void // // // GFX900: Function Attrs: convergent nounwind // GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_4_kernel -// GFX900-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR7]] !kernel_arg_addr_space [[META28]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META29]] !kernel_arg_base_type [[META29]] !kernel_arg_type_qual [[META25]] { +// GFX900-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR6]] !associated [[META38:![0-9]+]] !kernel_arg_addr_space [[META29]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META30]] !kernel_arg_base_type [[META30]] !kernel_arg_type_qual [[META25]] { // GFX900-NEXT: entry: // GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5) // GFX900-NEXT: store <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -804,7 +808,7 @@ kernel void test_target_features_kernel(global int *i) { // // GFX900: Function Attrs: convergent nounwind // GFX900-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke_kernel -// GFX900-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR7]] !kernel_arg_addr_space [[META28]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META29]] !kernel_arg_base_type [[META29]] !kernel_arg_type_qual [[META25]] { +// GFX900-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR6]] !associated [[META39:![0-9]+]] !kernel_arg_addr_space [[META29]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META30]] !kernel_arg_base_type [[META30]] !kernel_arg_type_qual [[META25]] { // GFX900-NEXT: entry: // GFX900-NEXT: [[TMP1:%.*]] = alloca { i32, i32, ptr }, align 8, addrspace(5) // GFX900-NEXT: store { i32, i32, ptr } [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -821,7 +825,7 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU: attributes #[[ATTR5]] = { alwaysinline convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+s-memtime-inst" "uniform-work-group-size"="false" } // NOCPU: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } // NOCPU: attributes #[[ATTR7]] = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// NOCPU: attributes #[[ATTR8]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "enqueued-block" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// NOCPU: attributes #[[ATTR8]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // NOCPU: attributes #[[ATTR9:[0-9]+]] = { nocallback nofree nosync nounwind willreturn } // NOCPU: attributes #[[ATTR10]] = { convergent nounwind "uniform-work-group-size"="false" } // NOCPU: attributes #[[ATTR11]] = { convergent nounwind } @@ -833,11 +837,10 @@ kernel void test_target_features_kernel(global int *i) { // GFX900: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } // GFX900: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } // GFX900: attributes #[[ATTR6]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" } -// GFX900: attributes #[[ATTR7]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "enqueued-block" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" } -// GFX900: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind willreturn } -// GFX900: attributes #[[ATTR9]] = { convergent nounwind "uniform-work-group-size"="false" } -// GFX900: attributes #[[ATTR10]] = { nounwind } -// GFX900: attributes #[[ATTR11]] = { convergent nounwind } +// GFX900: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind willreturn } +// GFX900: attributes #[[ATTR8]] = { convergent nounwind "uniform-work-group-size"="false" } +// GFX900: attributes #[[ATTR9]] = { nounwind } +// GFX900: attributes #[[ATTR10]] = { convergent nounwind } //. // NOCPU: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} // NOCPU: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} @@ -850,12 +853,17 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU: [[META8]] = !{!"none"} // NOCPU: [[META9]] = !{!"int*"} // NOCPU: [[META10]] = !{!""} -// NOCPU: [[META11]] = !{i32 0} -// NOCPU: [[META12]] = !{!"__block_literal"} -// NOCPU: [[META13]] = !{i32 0, i32 3} -// NOCPU: [[META14]] = !{!"none", !"none"} -// NOCPU: [[META15]] = !{!"__block_literal", !"void*"} -// NOCPU: [[META16]] = !{!"", !""} +// NOCPU: [[META11]] = !{ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle} +// NOCPU: [[META12]] = !{i32 0} +// NOCPU: [[META13]] = !{!"__block_literal"} +// NOCPU: [[META14]] = !{ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle} +// NOCPU: [[META15]] = !{ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle} +// NOCPU: [[META16]] = !{i32 0, i32 3} +// NOCPU: [[META17]] = !{!"none", !"none"} +// NOCPU: [[META18]] = !{!"__block_literal", !"void*"} +// NOCPU: [[META19]] = !{!"", !""} +// NOCPU: [[META20]] = !{ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle} +// NOCPU: [[META21]] = !{ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle} //. // GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} // GFX900: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} @@ -885,14 +893,18 @@ kernel void test_target_features_kernel(global int *i) { // GFX900: [[META25]] = !{!""} // GFX900: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0} // GFX900: [[META27]] = !{!"p1 int", [[META9]], i64 0} -// GFX900: [[META28]] = !{i32 0} -// GFX900: [[META29]] = !{!"__block_literal"} -// GFX900: [[TBAA30]] = !{[[META31:![0-9]+]], [[META31]], i64 0} -// GFX900: [[META31]] = !{!"p1 void", [[META9]], i64 0} -// GFX900: [[META32]] = !{i32 0, i32 3} -// GFX900: [[META33]] = !{!"none", !"none"} -// GFX900: [[META34]] = !{!"__block_literal", !"void*"} -// GFX900: [[META35]] = !{!"", !""} +// GFX900: [[META28]] = !{ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle} +// GFX900: [[META29]] = !{i32 0} +// GFX900: [[META30]] = !{!"__block_literal"} +// GFX900: [[META31]] = !{ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle} +// GFX900: [[TBAA32]] = !{[[META9]], [[META9]], i64 0} +// GFX900: [[META33]] = !{ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle} +// GFX900: [[META34]] = !{i32 0, i32 3} +// GFX900: [[META35]] = !{!"none", !"none"} +// GFX900: [[META36]] = !{!"__block_literal", !"void*"} +// GFX900: [[META37]] = !{!"", !""} +// GFX900: [[META38]] = !{ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle} +// GFX900: [[META39]] = !{ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle} //. //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: // CHECK: {{.*}} diff --git a/clang/test/CodeGenOpenCL/implicit-addrspacecast-function-parameter.cl b/clang/test/CodeGenOpenCL/implicit-addrspacecast-function-parameter.cl index 4a7bb8227c339..ae560898c2be7 100644 --- a/clang/test/CodeGenOpenCL/implicit-addrspacecast-function-parameter.cl +++ b/clang/test/CodeGenOpenCL/implicit-addrspacecast-function-parameter.cl @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 5 // RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s // Check there's no assertion when passing a pointer to an address space @@ -8,57 +8,66 @@ extern void private_ptr(__private int *); extern void local_ptr(__local int *); extern void generic_ptr(__generic int *); +void use_of_private_var() +{ + int x = 0 ; + private_ptr(&x); + generic_ptr(&x); +} + +void addr_of_arg(int x) +{ + private_ptr(&x); + generic_ptr(&x); +} + +__kernel void use_of_local_var() +{ + __local int x; + local_ptr(&x); + generic_ptr(&x); +} + // CHECK-LABEL: define dso_local void @use_of_private_var( // CHECK-SAME: ) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[X:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr -// CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[X]]) #[[ATTR4:[0-9]+]] +// CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[X]]) #[[ATTR5:[0-9]+]] // CHECK-NEXT: store i32 0, ptr [[X_ASCAST]], align 4, !tbaa [[TBAA4:![0-9]+]] -// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[X_ASCAST]] to ptr addrspace(5) -// CHECK-NEXT: call void @private_ptr(ptr addrspace(5) noundef [[TMP0]]) #[[ATTR5:[0-9]+]] -// CHECK-NEXT: call void @generic_ptr(ptr noundef [[X_ASCAST]]) #[[ATTR5]] -// CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[X]]) #[[ATTR4]] +// CHECK-NEXT: [[X_ASCAST_ASCAST:%.*]] = addrspacecast ptr [[X_ASCAST]] to ptr addrspace(5) +// CHECK-NEXT: call void @private_ptr(ptr addrspace(5) noundef [[X_ASCAST_ASCAST]]) #[[ATTR6:[0-9]+]] +// CHECK-NEXT: call void @generic_ptr(ptr noundef [[X_ASCAST]]) #[[ATTR6]] +// CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[X]]) #[[ATTR5]] // CHECK-NEXT: ret void // -void use_of_private_var() -{ - int x = 0 ; - private_ptr(&x); - generic_ptr(&x); -} - +// // CHECK-LABEL: define dso_local void @addr_of_arg( // CHECK-SAME: i32 noundef [[X:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr // CHECK-NEXT: store i32 [[X]], ptr [[X_ADDR_ASCAST]], align 4, !tbaa [[TBAA4]] -// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[X_ADDR_ASCAST]] to ptr addrspace(5) -// CHECK-NEXT: call void @private_ptr(ptr addrspace(5) noundef [[TMP0]]) #[[ATTR5]] -// CHECK-NEXT: call void @generic_ptr(ptr noundef [[X_ADDR_ASCAST]]) #[[ATTR5]] +// CHECK-NEXT: [[X_ADDR_ASCAST_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR_ASCAST]] to ptr addrspace(5) +// CHECK-NEXT: call void @private_ptr(ptr addrspace(5) noundef [[X_ADDR_ASCAST_ASCAST]]) #[[ATTR6]] +// CHECK-NEXT: call void @generic_ptr(ptr noundef [[X_ADDR_ASCAST]]) #[[ATTR6]] // CHECK-NEXT: ret void // -void addr_of_arg(int x) -{ - private_ptr(&x); - generic_ptr(&x); -} - +// // CHECK-LABEL: define dso_local amdgpu_kernel void @use_of_local_var( // CHECK-SAME: ) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META8]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: call void @local_ptr(ptr addrspace(3) noundef @use_of_local_var.x) #[[ATTR5]] -// CHECK-NEXT: call void @generic_ptr(ptr noundef addrspacecast (ptr addrspace(3) @use_of_local_var.x to ptr)) #[[ATTR5]] +// CHECK-NEXT: call void @__clang_ocl_kern_imp_use_of_local_var() #[[ATTR7:[0-9]+]] +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define dso_local void @__clang_ocl_kern_imp_use_of_local_var( +// CHECK-SAME: ) #[[ATTR4:[0-9]+]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META8]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @local_ptr(ptr addrspace(3) noundef @use_of_local_var.x) #[[ATTR6]] +// CHECK-NEXT: call void @generic_ptr(ptr noundef addrspacecast (ptr addrspace(3) @use_of_local_var.x to ptr)) #[[ATTR6]] // CHECK-NEXT: ret void // -__kernel void use_of_local_var() -{ - __local int x; - local_ptr(&x); - generic_ptr(&x); -} - //. // CHECK: [[TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} // CHECK: [[META5]] = !{!"int", [[META6:![0-9]+]], i64 0} From 721cfbba29a8d516ec7c76446f0af075d73f1f62 Mon Sep 17 00:00:00 2001 From: anikelal Date: Thu, 13 Mar 2025 15:27:17 +0530 Subject: [PATCH 6/8] Changing dyn_cast and assert to cast --- clang/lib/CodeGen/CGCall.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 6f5ccaa2f607d..dd4fcfed56cd1 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -500,8 +500,7 @@ CodeGenTypes::arrangeCXXConstructorCall(const CallArgList &args, /// definition of the given function. const CGFunctionInfo & CodeGenTypes::arrangeFunctionDeclaration(const GlobalDecl GD) { - const FunctionDecl *FD = dyn_cast(GD.getDecl()); - assert(FD && "GD must contain FunctionDecl"); + const FunctionDecl *FD = cast(GD.getDecl()); if (const CXXMethodDecl *MD = dyn_cast(FD)) if (MD->isImplicitObjectMemberFunction()) return arrangeCXXMethodDeclaration(MD); From 65811587f2f9fa2800607e2824877cd2f90c167f Mon Sep 17 00:00:00 2001 From: anikelal Date: Wed, 19 Mar 2025 12:01:42 +0530 Subject: [PATCH 7/8] Removed always-inline attribute from stubs and made required littest changes --- clang/lib/AST/MicrosoftMangle.cpp | 2 +- clang/lib/CodeGen/CGCall.cpp | 7 +- clang/lib/CodeGen/CGExpr.cpp | 8 +- clang/lib/CodeGen/CodeGenModule.cpp | 10 - .../CodeGenOpenCL/addr-space-struct-arg.cl | 2691 ++++++++++------- .../amdgpu-abi-struct-arg-byref.cl | 356 ++- .../CodeGenOpenCL/amdgpu-enqueue-kernel.cl | 206 +- .../test/CodeGenOpenCL/cl-uniform-wg-size.cl | 6 + .../cl20-device-side-enqueue-attributes.cl | 142 +- .../CodeGenOpenCL/cl20-device-side-enqueue.cl | 24 +- clang/test/CodeGenOpenCL/convergent.cl | 9 +- .../enqueue-kernel-non-entry-block.cl | 11 +- ...plicit-addrspacecast-function-parameter.cl | 2 +- .../test/CodeGenOpenCL/opencl-kernel-call.cl | 565 +--- clang/test/CodeGenOpenCL/reflect.cl | 36 +- clang/test/CodeGenOpenCL/sampler.cl | 4 + clang/test/CodeGenOpenCL/spir-calling-conv.cl | 5 +- .../amdgcn-machine-analysis-remarks.cl | 2 +- 18 files changed, 2124 insertions(+), 1962 deletions(-) diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp index 63291b23f10f4..4d14614fc1ec7 100644 --- a/clang/lib/AST/MicrosoftMangle.cpp +++ b/clang/lib/AST/MicrosoftMangle.cpp @@ -1163,7 +1163,7 @@ void MicrosoftCXXNameMangler::mangleUnqualifiedName(GlobalDecl GD, ->hasAttr())) && GD.getKernelReferenceKind() == KernelReferenceKind::Stub; bool IsOCLDeviceStub = - ND && (isa(ND) && ND->hasAttr()) && + ND && isa(ND) && ND->hasAttr() && GD.getKernelReferenceKind() == KernelReferenceKind::Stub; if (IsDeviceStub) mangleSourceName( diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index dd4fcfed56cd1..ab200a63194f5 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -2503,7 +2503,12 @@ void CodeGenModule::ConstructAttributeList(StringRef Name, NumElemsParam); } - if (TargetDecl->hasAttr()) { + if (TargetDecl->hasAttr() && + CallingConv != CallingConv::CC_C && + CallingConv != + CallingConv::CC_SpirFunction) { // Check CallingConv to avoid adding + // uniform-work-group-size attribute + // to OpenCL Kernel Stub if (getLangOpts().OpenCLVersion <= 120) { // OpenCL v1.2 Work groups are always uniform FuncAttrs.addAttribute("uniform-work-group-size", "true"); diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index e39b5aad3bacf..de7c577a23493 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -6140,11 +6140,9 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType, const auto *FnType = cast(PointeeType); - if (auto FD = dyn_cast_or_null(TargetDecl)) { - if (FD->hasAttr()) { - CGM.getTargetCodeGenInfo().setOCLKernelStubCallingConvention(FnType); - } - } + if (const auto *FD = dyn_cast_or_null(TargetDecl); + FD && FD->hasAttr()) + CGM.getTargetCodeGenInfo().setOCLKernelStubCallingConvention(FnType); // If we are checking indirect calls and this call is indirect, check that the // function pointer is a member of the bit set for the function type. diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 8c0a2896a894f..3a3e9ac77670c 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -6149,16 +6149,6 @@ void CodeGenModule::EmitGlobalFunctionDefinition(GlobalDecl GD, setNonAliasAttributes(GD, Fn); - if (D->hasAttr()) { - if (GD.getKernelReferenceKind() == KernelReferenceKind::Stub) { - if (Fn->hasFnAttribute(llvm::Attribute::NoInline)) - Fn->removeFnAttr(llvm::Attribute::NoInline); - if (Fn->hasFnAttribute(llvm::Attribute::InlineHint)) - Fn->removeFnAttr(llvm::Attribute::InlineHint); - Fn->addFnAttr(llvm::Attribute::AlwaysInline); - } - } - SetLLVMFunctionAttributesForDefinition(D, Fn); if (const ConstructorAttr *CA = D->getAttr()) diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl index 79befe7ef86a7..789aae7a5c34c 100644 --- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl +++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 5 // RUN: %clang_cc1 %s -emit-llvm -o - -O0 -ffake-address-space-map -triple i686-pc-darwin | FileCheck -check-prefixes=X86 %s // RUN: %clang_cc1 %s -emit-llvm -o - -O0 -triple amdgcn | FileCheck -check-prefixes=AMDGCN %s // RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL2.0 -O0 -triple amdgcn | FileCheck -check-prefixes=AMDGCN20 %s @@ -46,6 +46,77 @@ struct LargeStructTwoMember { struct LargeStructOneMember g_s; #endif +Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { + Mat4X4 out; + return out; +} + +kernel void ker(global Mat3X3 *in, global Mat4X4 *out) { + out[0] = foo(in[1]); +} + +Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { + Mat64X64 out; + return out; +} + +kernel void ker_large(global Mat32X32 *in, global Mat64X64 *out) { + out[0] = foo_large(in[1]); +} + +void FuncOneMember(struct StructOneMember u) { + u.x = (int2)(0, 0); +} + +void FuncOneLargeMember(struct LargeStructOneMember u) { + u.x[0] = (int2)(0, 0); +} + +#if (__OPENCL_C_VERSION__ == 200) || (__OPENCL_C_VERSION__ >= 300 && defined(__opencl_c_program_scope_global_variables)) + +void test_indirect_arg_globl(void) { + FuncOneLargeMember(g_s); +} +#endif + +kernel void test_indirect_arg_local(void) { + local struct LargeStructOneMember l_s; + FuncOneLargeMember(l_s); +} + +void test_indirect_arg_private(void) { + struct LargeStructOneMember p_s; + FuncOneLargeMember(p_s); +} + +kernel void KernelOneMember(struct StructOneMember u) { + FuncOneMember(u); +} + +kernel void KernelOneMemberSpir(global struct StructOneMember* u) { + FuncOneMember(*u); +} + +kernel void KernelLargeOneMember(struct LargeStructOneMember u) { + FuncOneLargeMember(u); +} + +void FuncTwoMember(struct StructTwoMember u) { + u.y = (int2)(0, 0); +} + +void FuncLargeTwoMember(struct LargeStructTwoMember u) { + u.y[0] = (int2)(0, 0); +} + +kernel void KernelTwoMember(struct StructTwoMember u) { + FuncTwoMember(u); +} + +kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { + FuncLargeTwoMember(u); +} + // X86-LABEL: define void @foo( // X86-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT4X4:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT3X3:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0:[0-9]+]] { // X86-NEXT: [[ENTRY:.*:]] @@ -53,208 +124,39 @@ struct LargeStructOneMember g_s; // X86-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4 // X86-NEXT: ret void // -// AMDGCN-LABEL: define dso_local %struct.Mat4X4 @foo( -// AMDGCN-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) -// AMDGCN-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5) -// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0 -// AMDGCN-NEXT: store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4 -// AMDGCN-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4 -// AMDGCN-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]] -// -// AMDGCN20-LABEL: define dso_local %struct.Mat4X4 @foo( -// AMDGCN20-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) -// AMDGCN20-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5) -// AMDGCN20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// AMDGCN20-NEXT: [[IN1:%.*]] = addrspacecast ptr addrspace(5) [[IN]] to ptr -// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr [[IN1]], i32 0, i32 0 -// AMDGCN20-NEXT: store [9 x i32] [[IN_COERCE]], ptr [[COERCE_DIVE]], align 4 -// AMDGCN20-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr [[RETVAL_ASCAST]], align 4 -// AMDGCN20-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]] -// -// SPIR-LABEL: define dso_local spir_func void @foo( -// SPIR-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT4X4:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT3X3:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0:[0-9]+]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: ret void -// -// AMDGCN30-GVAR-LABEL: define dso_local %struct.Mat4X4 @foo( -// AMDGCN30-GVAR-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4 -// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4 -// AMDGCN30-GVAR-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]] -// -// AMDGCN30-LABEL: define dso_local %struct.Mat4X4 @foo( -// AMDGCN30-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { -// AMDGCN30-NEXT: [[ENTRY:.*:]] -// AMDGCN30-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) -// AMDGCN30-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5) -// AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0 -// AMDGCN30-NEXT: store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4 -// AMDGCN30-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4 -// AMDGCN30-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]] // -Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { - Mat4X4 out; - return out; -} - // X86-LABEL: define spir_kernel void @ker( // X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { // X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 -// X86-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 -// X86-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4 -// X86-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4 // X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // X86-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4 // X86-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4 // X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4 // X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4 -// X86-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_I]], align 4 -// X86-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_I]], align 4 -// X86-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_I]], align 4 -// X86-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_I]], align 4 -// X86-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[TMP3]], i32 1 -// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i32 36, i1 false) -// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP_I]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4:[0-9]+]] -// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[TMP2]], ptr align 4 [[TMP_I]], i32 64, i1 false) +// X86-NEXT: call void @__clang_ocl_kern_imp_ker(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR3:[0-9]+]] // X86-NEXT: ret void // -// AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker( -// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) -// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 -// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[IN_ADDR_I]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP1]], ptr addrspace(5) [[OUT_ADDR_I]], align 8 -// AMDGCN-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR_I]], align 8 -// AMDGCN-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR_I]], align 8 -// AMDGCN-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP3]], i64 1 -// AMDGCN-NEXT: [[TMP4:%.*]] = load [9 x i32], ptr addrspace(1) [[ARRAYIDX1_I]], align 4 -// AMDGCN-NEXT: [[CALL_I:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP4]]) #[[ATTR4:[0-9]+]] -// AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL_I]], 0 -// AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP_I]], align 4 -// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(5) align 4 [[TMP_I]], i64 64, i1 false) -// AMDGCN-NEXT: ret void -// -// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @ker( -// AMDGCN20-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN20-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN20-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) -// AMDGCN20-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN20-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN20-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr -// AMDGCN20-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr -// AMDGCN20-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8 -// AMDGCN20-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 -// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8 -// AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 -// AMDGCN20-NEXT: [[IN_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR_I]] to ptr -// AMDGCN20-NEXT: [[OUT_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR_I]] to ptr -// AMDGCN20-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_ASCAST_I]], align 8 -// AMDGCN20-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_ASCAST_I]], align 8 -// AMDGCN20-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST_I]], align 8 -// AMDGCN20-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST_I]], align 8 -// AMDGCN20-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP3]], i64 1 -// AMDGCN20-NEXT: [[TMP4:%.*]] = load [9 x i32], ptr addrspace(1) [[ARRAYIDX1_I]], align 4 -// AMDGCN20-NEXT: [[CALL_I:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP4]]) #[[ATTR4:[0-9]+]] -// AMDGCN20-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL_I]], 0 -// AMDGCN20-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP_I]], align 4 -// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(5) align 4 [[TMP_I]], i64 64, i1 false) -// AMDGCN20-NEXT: ret void -// -// SPIR-LABEL: define dso_local spir_kernel void @ker( -// SPIR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6:![0-9]+]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 -// SPIR-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 -// SPIR-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4 -// SPIR-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4 -// SPIR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4 -// SPIR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4 -// SPIR-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4 -// SPIR-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4 -// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4 -// SPIR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4 -// SPIR-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_I]], align 4 -// SPIR-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_I]], align 4 -// SPIR-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_I]], align 4 -// SPIR-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_I]], align 4 -// SPIR-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[TMP3]], i32 1 -// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i32 36, i1 false) -// SPIR-NEXT: call spir_func void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP_I]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4:[0-9]+]] -// SPIR-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[TMP2]], ptr align 4 [[TMP_I]], i32 64, i1 false) -// SPIR-NEXT: ret void // -// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @ker( -// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[IN_ADDR_I]], align 8 -// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[TMP1]], ptr addrspace(5) [[OUT_ADDR_I]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR_I]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR_I]], align 8 -// AMDGCN30-GVAR-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP3]], i64 1 -// AMDGCN30-GVAR-NEXT: [[TMP4:%.*]] = load [9 x i32], ptr addrspace(1) [[ARRAYIDX1_I]], align 4 -// AMDGCN30-GVAR-NEXT: [[CALL_I:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP4]]) #[[ATTR4:[0-9]+]] -// AMDGCN30-GVAR-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL_I]], 0 -// AMDGCN30-GVAR-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP_I]], align 4 -// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(5) align 4 [[TMP_I]], i64 64, i1 false) -// AMDGCN30-GVAR-NEXT: ret void +// X86-LABEL: define void @__clang_ocl_kern_imp_ker( +// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// X86-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4 +// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4 +// X86-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4 +// X86-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4 +// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i32 0 +// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4 +// X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[TMP1]], i32 1 +// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 36, i1 false) +// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] +// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 64, i1 false) +// X86-NEXT: ret void // -// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @ker( -// AMDGCN30-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { -// AMDGCN30-NEXT: [[ENTRY:.*:]] -// AMDGCN30-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) -// AMDGCN30-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 -// AMDGCN30-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 -// AMDGCN30-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN30-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[IN_ADDR_I]], align 8 -// AMDGCN30-NEXT: store ptr addrspace(1) [[TMP1]], ptr addrspace(5) [[OUT_ADDR_I]], align 8 -// AMDGCN30-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR_I]], align 8 -// AMDGCN30-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR_I]], align 8 -// AMDGCN30-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP3]], i64 1 -// AMDGCN30-NEXT: [[TMP4:%.*]] = load [9 x i32], ptr addrspace(1) [[ARRAYIDX1_I]], align 4 -// AMDGCN30-NEXT: [[CALL_I:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP4]]) #[[ATTR4:[0-9]+]] -// AMDGCN30-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL_I]], 0 -// AMDGCN30-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP_I]], align 4 -// AMDGCN30-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(5) align 4 [[TMP_I]], i64 64, i1 false) -// AMDGCN30-NEXT: ret void // -kernel void ker(global Mat3X3 *in, global Mat4X4 *out) { - out[0] = foo(in[1]); -} - // X86-LABEL: define void @foo_large( // X86-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT32X32:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0]] { // X86-NEXT: [[ENTRY:.*:]] @@ -262,191 +164,39 @@ kernel void ker(global Mat3X3 *in, global Mat4X4 *out) { // X86-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4 // X86-NEXT: ret void // -// AMDGCN-LABEL: define dso_local void @foo_large( -// AMDGCN-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false) -// AMDGCN-NEXT: ret void -// -// AMDGCN20-LABEL: define dso_local void @foo_large( -// AMDGCN20-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) -// AMDGCN20-NEXT: [[IN:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr -// AMDGCN20-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false) -// AMDGCN20-NEXT: ret void -// -// SPIR-LABEL: define dso_local spir_func void @foo_large( -// SPIR-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT32X32:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: ret void -// -// AMDGCN30-GVAR-LABEL: define dso_local void @foo_large( -// AMDGCN30-GVAR-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) -// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false) -// AMDGCN30-GVAR-NEXT: ret void -// -// AMDGCN30-LABEL: define dso_local void @foo_large( -// AMDGCN30-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { -// AMDGCN30-NEXT: [[ENTRY:.*:]] -// AMDGCN30-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) -// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false) -// AMDGCN30-NEXT: ret void // -Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { - Mat64X64 out; - return out; -} - // X86-LABEL: define spir_kernel void @ker_large( // X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { // X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 -// X86-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 -// X86-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4 -// X86-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4 // X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // X86-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4 // X86-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4 // X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4 // X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4 -// X86-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_I]], align 4 -// X86-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_I]], align 4 -// X86-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_I]], align 4 -// X86-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_I]], align 4 -// X86-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP3]], i32 1 -// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i32 4096, i1 false) -// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]] -// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[TMP2]], ptr align 4 [[TMP_I]], i32 16384, i1 false) +// X86-NEXT: call void @__clang_ocl_kern_imp_ker_large(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR3]] // X86-NEXT: ret void // -// AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker_large( -// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) -// AMDGCN-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) -// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 -// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[IN_ADDR_I]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP1]], ptr addrspace(5) [[OUT_ADDR_I]], align 8 -// AMDGCN-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR_I]], align 8 -// AMDGCN-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR_I]], align 8 -// AMDGCN-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP3]], i64 1 -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i64 4096, i1 false) -// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]] -// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(5) align 4 [[TMP_I]], i64 16384, i1 false) -// AMDGCN-NEXT: ret void // -// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @ker_large( -// AMDGCN20-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN20-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN20-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) -// AMDGCN20-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) -// AMDGCN20-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN20-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN20-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr -// AMDGCN20-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr -// AMDGCN20-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8 -// AMDGCN20-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 -// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8 -// AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 -// AMDGCN20-NEXT: [[IN_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR_I]] to ptr -// AMDGCN20-NEXT: [[OUT_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR_I]] to ptr -// AMDGCN20-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_ASCAST_I]], align 8 -// AMDGCN20-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_ASCAST_I]], align 8 -// AMDGCN20-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST_I]], align 8 -// AMDGCN20-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST_I]], align 8 -// AMDGCN20-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP3]], i64 1 -// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i64 4096, i1 false) -// AMDGCN20-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]] -// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(5) align 4 [[TMP_I]], i64 16384, i1 false) -// AMDGCN20-NEXT: ret void +// X86-LABEL: define void @__clang_ocl_kern_imp_ker_large( +// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// X86-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4 +// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4 +// X86-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4 +// X86-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4 +// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i32 0 +// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4 +// X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i32 1 +// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 4096, i1 false) +// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] +// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 16384, i1 false) +// X86-NEXT: ret void // -// SPIR-LABEL: define dso_local spir_kernel void @ker_large( -// SPIR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META7:![0-9]+]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META6]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 -// SPIR-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 -// SPIR-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4 -// SPIR-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4 -// SPIR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4 -// SPIR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4 -// SPIR-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4 -// SPIR-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4 -// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4 -// SPIR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4 -// SPIR-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_I]], align 4 -// SPIR-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_I]], align 4 -// SPIR-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_I]], align 4 -// SPIR-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_I]], align 4 -// SPIR-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP3]], i32 1 -// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i32 4096, i1 false) -// SPIR-NEXT: call spir_func void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]] -// SPIR-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[TMP2]], ptr align 4 [[TMP_I]], i32 16384, i1 false) -// SPIR-NEXT: ret void -// -// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @ker_large( -// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[IN_ADDR_I]], align 8 -// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[TMP1]], ptr addrspace(5) [[OUT_ADDR_I]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR_I]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR_I]], align 8 -// AMDGCN30-GVAR-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP3]], i64 1 -// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i64 4096, i1 false) -// AMDGCN30-GVAR-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]] -// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(5) align 4 [[TMP_I]], i64 16384, i1 false) -// AMDGCN30-GVAR-NEXT: ret void -// -// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @ker_large( -// AMDGCN30-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { -// AMDGCN30-NEXT: [[ENTRY:.*:]] -// AMDGCN30-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) -// AMDGCN30-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) -// AMDGCN30-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 -// AMDGCN30-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 -// AMDGCN30-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN30-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[IN_ADDR_I]], align 8 -// AMDGCN30-NEXT: store ptr addrspace(1) [[TMP1]], ptr addrspace(5) [[OUT_ADDR_I]], align 8 -// AMDGCN30-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR_I]], align 8 -// AMDGCN30-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR_I]], align 8 -// AMDGCN30-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP3]], i64 1 -// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i64 4096, i1 false) -// AMDGCN30-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]] -// AMDGCN30-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(5) align 4 [[TMP_I]], i64 16384, i1 false) -// AMDGCN30-NEXT: ret void // -kernel void ker_large(global Mat32X32 *in, global Mat64X64 *out) { - out[0] = foo_large(in[1]); -} - // X86-LABEL: define void @FuncOneMember( // X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { // X86-NEXT: [[ENTRY:.*:]] @@ -459,74 +209,7 @@ kernel void ker_large(global Mat32X32 *in, global Mat64X64 *out) { // X86-NEXT: store <2 x i32> [[TMP1]], ptr [[X]], align 8 // X86-NEXT: ret void // -// AMDGCN-LABEL: define dso_local void @FuncOneMember( -// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 -// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8 -// AMDGCN-NEXT: ret void -// -// AMDGCN20-LABEL: define dso_local void @FuncOneMember( -// AMDGCN20-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr -// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr -// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 -// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE]], ptr [[COERCE_DIVE]], align 8 -// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 -// AMDGCN20-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 -// AMDGCN20-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 -// AMDGCN20-NEXT: store <2 x i32> [[TMP0]], ptr [[X]], align 8 -// AMDGCN20-NEXT: ret void -// -// SPIR-LABEL: define dso_local spir_func void @FuncOneMember( -// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 -// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 -// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 -// SPIR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0 -// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[X]], align 8 -// SPIR-NEXT: ret void -// -// AMDGCN30-GVAR-LABEL: define dso_local void @FuncOneMember( -// AMDGCN30-GVAR-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 -// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN30-GVAR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8 -// AMDGCN30-GVAR-NEXT: ret void -// -// AMDGCN30-LABEL: define dso_local void @FuncOneMember( -// AMDGCN30-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] { -// AMDGCN30-NEXT: [[ENTRY:.*:]] -// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN30-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 -// AMDGCN30-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN30-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN30-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8 -// AMDGCN30-NEXT: ret void // -void FuncOneMember(struct StructOneMember u) { - u.x = (int2)(0, 0); -} - // X86-LABEL: define void @FuncOneLargeMember( // X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { // X86-NEXT: [[ENTRY:.*:]] @@ -540,442 +223,85 @@ void FuncOneMember(struct StructOneMember u) { // X86-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8 // X86-NEXT: ret void // -// AMDGCN-LABEL: define dso_local void @FuncOneLargeMember( -// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) -// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0 -// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 -// AMDGCN-NEXT: ret void -// -// AMDGCN20-LABEL: define dso_local void @FuncOneLargeMember( -// AMDGCN20-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// AMDGCN20-NEXT: [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr -// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr -// AMDGCN20-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) -// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 -// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 -// AMDGCN20-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0 -// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr [[X]], i64 0, i64 0 -// AMDGCN20-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8 -// AMDGCN20-NEXT: ret void -// -// SPIR-LABEL: define dso_local spir_func void @FuncOneLargeMember( -// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 -// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 -// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 -// SPIR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0 -// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr [[X]], i32 0, i32 0 -// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[ARRAYIDX]], align 8 -// SPIR-NEXT: ret void -// -// AMDGCN30-GVAR-LABEL: define dso_local void @FuncOneLargeMember( -// AMDGCN30-GVAR-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) -// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN30-GVAR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0 -// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 -// AMDGCN30-GVAR-NEXT: ret void -// -// AMDGCN30-LABEL: define dso_local void @FuncOneLargeMember( -// AMDGCN30-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { -// AMDGCN30-NEXT: [[ENTRY:.*:]] -// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN30-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) -// AMDGCN30-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN30-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN30-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0 -// AMDGCN30-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 -// AMDGCN30-NEXT: ret void -// -void FuncOneLargeMember(struct LargeStructOneMember u) { - u.x[0] = (int2)(0, 0); -} - -#if (__OPENCL_C_VERSION__ == 200) || (__OPENCL_C_VERSION__ >= 300 && defined(__opencl_c_program_scope_global_variables)) - -// AMDGCN20-LABEL: define dso_local void @test_indirect_arg_globl( -// AMDGCN20-SAME: ) #[[ATTR0]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 @g_s, i64 800, i1 false) -// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] -// AMDGCN20-NEXT: ret void -// -// AMDGCN30-GVAR-LABEL: define dso_local void @test_indirect_arg_globl( -// AMDGCN30-GVAR-SAME: ) #[[ATTR0]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 @g_s, i64 800, i1 false) -// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] -// AMDGCN30-GVAR-NEXT: ret void // -void test_indirect_arg_globl(void) { - FuncOneLargeMember(g_s); -} -#endif - // X86-LABEL: define spir_kernel void @test_indirect_arg_local( // X86-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { // X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 4 -// X86-NEXT: call void @llvm.memcpy.p0.p3.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i32 800, i1 false) -// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]] +// X86-NEXT: call void @__clang_ocl_kern_imp_test_indirect_arg_local() #[[ATTR3]] // X86-NEXT: ret void // -// AMDGCN-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local( -// AMDGCN-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP_I]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP_I]]) #[[ATTR4]] -// AMDGCN-NEXT: ret void -// -// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local( -// AMDGCN20-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP_I]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) -// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP_I]]) #[[ATTR4]] -// AMDGCN20-NEXT: ret void -// -// SPIR-LABEL: define dso_local spir_kernel void @test_indirect_arg_local( -// SPIR-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META8]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8 -// SPIR-NEXT: call void @llvm.memcpy.p0.p3.i32(ptr align 8 [[BYVAL_TEMP_I]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i32 800, i1 false) -// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP_I]]) #[[ATTR4]] -// SPIR-NEXT: ret void // -// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local( -// AMDGCN30-GVAR-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP_I]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) -// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP_I]]) #[[ATTR4]] -// AMDGCN30-GVAR-NEXT: ret void +// X86-LABEL: define void @__clang_ocl_kern_imp_test_indirect_arg_local( +// X86-SAME: ) #[[ATTR0]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 4 +// X86-NEXT: call void @llvm.memcpy.p0.p3.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i32 800, i1 false) +// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] +// X86-NEXT: ret void // -// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local( -// AMDGCN30-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { -// AMDGCN30-NEXT: [[ENTRY:.*:]] -// AMDGCN30-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP_I]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) -// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP_I]]) #[[ATTR4]] -// AMDGCN30-NEXT: ret void // -kernel void test_indirect_arg_local(void) { - local struct LargeStructOneMember l_s; - FuncOneLargeMember(l_s); -} - // X86-LABEL: define void @test_indirect_arg_private( // X86-SAME: ) #[[ATTR0]] { // X86-NEXT: [[ENTRY:.*:]] // X86-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8 -// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[P_S]]) #[[ATTR4]] +// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[P_S]]) #[[ATTR3]] // X86-NEXT: ret void // -// AMDGCN-LABEL: define dso_local void @test_indirect_arg_private( -// AMDGCN-SAME: ) #[[ATTR0]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR4]] -// AMDGCN-NEXT: ret void // -// AMDGCN20-LABEL: define dso_local void @test_indirect_arg_private( -// AMDGCN20-SAME: ) #[[ATTR0]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[P_S_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_S]] to ptr -// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[P_S_ASCAST]], i64 800, i1 false) -// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] -// AMDGCN20-NEXT: ret void +// X86-LABEL: define spir_kernel void @KernelOneMember( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: call void @__clang_ocl_kern_imp_KernelOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]] +// X86-NEXT: ret void // -// SPIR-LABEL: define dso_local spir_func void @test_indirect_arg_private( -// SPIR-SAME: ) #[[ATTR0]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8 -// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR4]] -// SPIR-NEXT: ret void // -// AMDGCN30-GVAR-LABEL: define dso_local void @test_indirect_arg_private( -// AMDGCN30-GVAR-SAME: ) #[[ATTR0]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR4]] -// AMDGCN30-GVAR-NEXT: ret void +// X86-LABEL: define void @__clang_ocl_kern_imp_KernelOneMember( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 8, i1 false) +// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]] +// X86-NEXT: ret void // -// AMDGCN30-LABEL: define dso_local void @test_indirect_arg_private( -// AMDGCN30-SAME: ) #[[ATTR0]] { -// AMDGCN30-NEXT: [[ENTRY:.*:]] -// AMDGCN30-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR4]] -// AMDGCN30-NEXT: ret void -// -void test_indirect_arg_private(void) { - struct LargeStructOneMember p_s; - FuncOneLargeMember(p_s); -} - -// X86-LABEL: define spir_kernel void @KernelOneMember( -// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { -// X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 -// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 -// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[U1]], ptr align 1 [[U]], i64 8, i1 false) -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 8, i1 false) -// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U_I]]) #[[ATTR4]] -// X86-NEXT: ret void -// -// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMember( -// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 -// AMDGCN-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 -// AMDGCN-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[U_I]], align 8 -// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[U_I]], align 8 -// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR4]] -// AMDGCN-NEXT: ret void -// -// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelOneMember( -// AMDGCN20-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr -// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 -// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE]], ptr [[COERCE_DIVE]], align 8 -// AMDGCN20-NEXT: [[COERCE_DIVE2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 -// AMDGCN20-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[COERCE_DIVE2]], align 8 -// AMDGCN20-NEXT: [[U1_I:%.*]] = addrspacecast ptr addrspace(5) [[U_I]] to ptr -// AMDGCN20-NEXT: store <2 x i32> [[TMP0]], ptr [[U1_I]], align 8 -// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[U1_I]], align 8 -// AMDGCN20-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR4]] -// AMDGCN20-NEXT: ret void -// -// SPIR-LABEL: define dso_local spir_kernel void @KernelOneMember( -// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META10:![0-9]+]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META11]] !kernel_arg_type_qual [[META12:![0-9]+]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: [[U1:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 -// SPIR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[U1]], ptr align 1 [[U]], i64 8, i1 false) -// SPIR-NEXT: call spir_func void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 8 [[U1]]) #[[ATTR4]] -// SPIR-NEXT: ret void -// -// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelOneMember( -// AMDGCN30-GVAR-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 -// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 -// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[U_I]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[U_I]], align 8 -// AMDGCN30-GVAR-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR4]] -// AMDGCN30-GVAR-NEXT: ret void -// -// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelOneMember( -// AMDGCN30-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { -// AMDGCN30-NEXT: [[ENTRY:.*:]] -// AMDGCN30-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 -// AMDGCN30-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 -// AMDGCN30-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[U_I]], align 8 -// AMDGCN30-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[U_I]], align 8 -// AMDGCN30-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR4]] -// AMDGCN30-NEXT: ret void // -kernel void KernelOneMember(struct StructOneMember u) { - FuncOneMember(u); -} - // X86-LABEL: define spir_kernel void @KernelOneMemberSpir( // X86-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { // X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: [[U_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 -// X86-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 4 // X86-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // X86-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 4 // X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4 -// X86-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[U_ADDR_I]], align 4 -// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_I]], align 4 -// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 8 [[TMP1]], i32 8, i1 false) -// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]] +// X86-NEXT: call void @__clang_ocl_kern_imp_KernelOneMemberSpir(ptr addrspace(1) noundef align 8 [[TMP0]]) #[[ATTR3]] // X86-NEXT: ret void // -// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir( -// AMDGCN-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[U_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8 -// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[U_ADDR_I]], align 8 -// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR_I]], align 8 -// AMDGCN-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(1) [[TMP1]], align 8 -// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP2]]) #[[ATTR4]] -// AMDGCN-NEXT: ret void -// -// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir( -// AMDGCN20-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[U_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN20-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN20-NEXT: [[U_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[U_ADDR]] to ptr -// AMDGCN20-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR_ASCAST]], align 8 -// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_ASCAST]], align 8 -// AMDGCN20-NEXT: [[U_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[U_ADDR_I]] to ptr -// AMDGCN20-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[U_ADDR_ASCAST_I]], align 8 -// AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_ASCAST_I]], align 8 -// AMDGCN20-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(1) [[TMP1]], align 8 -// AMDGCN20-NEXT: call void @FuncOneMember(<2 x i32> [[TMP2]]) #[[ATTR4]] -// AMDGCN20-NEXT: ret void // -// SPIR-LABEL: define dso_local spir_kernel void @KernelOneMemberSpir( -// SPIR-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META13:![0-9]+]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META14:![0-9]+]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META12]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: [[U_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 -// SPIR-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8 -// SPIR-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4 -// SPIR-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 4 -// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4 -// SPIR-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[U_ADDR_I]], align 4 -// SPIR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_I]], align 4 -// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 8 [[BYVAL_TEMP_I]], ptr addrspace(1) align 8 [[TMP1]], i32 8, i1 false) -// SPIR-NEXT: call spir_func void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP_I]]) #[[ATTR4]] -// SPIR-NEXT: ret void -// -// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir( -// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[U_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[U_ADDR_I]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR_I]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(1) [[TMP1]], align 8 -// AMDGCN30-GVAR-NEXT: call void @FuncOneMember(<2 x i32> [[TMP2]]) #[[ATTR4]] -// AMDGCN30-GVAR-NEXT: ret void +// X86-LABEL: define void @__clang_ocl_kern_imp_KernelOneMemberSpir( +// X86-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META14]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 4 +// X86-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4 +// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 8 [[TMP0]], i32 8, i1 false) +// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] +// X86-NEXT: ret void // -// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir( -// AMDGCN30-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { -// AMDGCN30-NEXT: [[ENTRY:.*:]] -// AMDGCN30-NEXT: [[U_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8 -// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8 -// AMDGCN30-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[U_ADDR_I]], align 8 -// AMDGCN30-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR_I]], align 8 -// AMDGCN30-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(1) [[TMP1]], align 8 -// AMDGCN30-NEXT: call void @FuncOneMember(<2 x i32> [[TMP2]]) #[[ATTR4]] -// AMDGCN30-NEXT: ret void // -kernel void KernelOneMemberSpir(global struct StructOneMember* u) { - FuncOneMember(*u); -} - // X86-LABEL: define spir_kernel void @KernelLargeOneMember( // X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { // X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 -// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 -// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[U1]], ptr align 1 [[U]], i64 800, i1 false) -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 800, i1 false) -// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U_I]]) #[[ATTR4]] +// X86-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]] // X86-NEXT: ret void // -// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( -// AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 -// AMDGCN-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U_I]], ptr addrspace(5) align 8 [[U]], i64 800, i1 false) -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U_I]]) #[[ATTR4]] -// AMDGCN-NEXT: ret void -// -// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( -// AMDGCN20-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[COERCE_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr -// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 -// AMDGCN20-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 -// AMDGCN20-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8 -// AMDGCN20-NEXT: [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5) -// AMDGCN20-NEXT: [[U_I:%.*]] = addrspacecast ptr addrspace(5) [[COERCE_I]] to ptr -// AMDGCN20-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U_I]], ptr addrspace(5) align 8 [[U1_ASCAST]], i64 800, i1 false) -// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP_I]], ptr align 8 [[U_I]], i64 800, i1 false) -// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP_I]]) #[[ATTR4]] -// AMDGCN20-NEXT: ret void -// -// SPIR-LABEL: define dso_local spir_kernel void @KernelLargeOneMember( -// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META12]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: [[U1:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 -// SPIR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[U1]], ptr align 1 [[U]], i64 800, i1 false) -// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U1]]) #[[ATTR4]] -// SPIR-NEXT: ret void // -// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( -// AMDGCN30-GVAR-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 -// AMDGCN30-GVAR-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U_I]], ptr addrspace(5) align 8 [[U]], i64 800, i1 false) -// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U_I]]) #[[ATTR4]] -// AMDGCN30-GVAR-NEXT: ret void +// X86-LABEL: define void @__clang_ocl_kern_imp_KernelLargeOneMember( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 800, i1 false) +// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]] +// X86-NEXT: ret void // -// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( -// AMDGCN30-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { -// AMDGCN30-NEXT: [[ENTRY:.*:]] -// AMDGCN30-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN30-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 -// AMDGCN30-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U_I]], ptr addrspace(5) align 8 [[U]], i64 800, i1 false) -// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U_I]]) #[[ATTR4]] -// AMDGCN30-NEXT: ret void // -kernel void KernelLargeOneMember(struct LargeStructOneMember u) { - FuncOneLargeMember(u); -} - // X86-LABEL: define void @FuncTwoMember( // X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { // X86-NEXT: [[ENTRY:.*:]] @@ -988,62 +314,1409 @@ kernel void KernelLargeOneMember(struct LargeStructOneMember u) { // X86-NEXT: store <2 x i32> [[TMP1]], ptr [[Y]], align 8 // X86-NEXT: ret void // -// AMDGCN-LABEL: define dso_local void @FuncTwoMember( -// AMDGCN-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] { +// +// X86-LABEL: define void @FuncLargeTwoMember( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 +// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 480, i1 false) +// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1 +// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr [[Y]], i32 0, i32 0 +// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8 +// X86-NEXT: ret void +// +// +// X86-LABEL: define spir_kernel void @KernelTwoMember( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR3]] +// X86-NEXT: ret void +// +// +// X86-LABEL: define void @__clang_ocl_kern_imp_KernelTwoMember( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 16, i1 false) +// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR3]] +// X86-NEXT: ret void +// +// +// X86-LABEL: define spir_kernel void @KernelLargeTwoMember( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR3]] +// X86-NEXT: ret void +// +// +// X86-LABEL: define void @__clang_ocl_kern_imp_KernelLargeTwoMember( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 480, i1 false) +// X86-NEXT: call void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR3]] +// X86-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local %struct.Mat4X4 @foo( +// AMDGCN-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { // AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8 -// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8 +// AMDGCN-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0 +// AMDGCN-NEXT: store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4 +// AMDGCN-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4 +// AMDGCN-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]] +// +// +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ker(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR4:[0-9]+]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_ker( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1 +// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4 +// AMDGCN-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR4]] +// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0 +// AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4 +// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false) +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local void @foo_large( +// AMDGCN-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false) // AMDGCN-NEXT: ret void // -// AMDGCN20-LABEL: define dso_local void @FuncTwoMember( -// AMDGCN20-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr -// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr -// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 -// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE0]], ptr [[TMP0]], align 8 -// AMDGCN20-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 -// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE1]], ptr [[TMP1]], align 8 -// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 -// AMDGCN20-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 -// AMDGCN20-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 -// AMDGCN20-NEXT: store <2 x i32> [[TMP2]], ptr [[Y]], align 8 -// AMDGCN20-NEXT: ret void // -// SPIR-LABEL: define dso_local spir_func void @FuncTwoMember( -// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 -// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 -// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 -// SPIR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1 -// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[Y]], align 8 -// SPIR-NEXT: ret void +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker_large( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ker_large(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_ker_large( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1 +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false) +// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]] +// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false) +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local void @FuncOneMember( +// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 +// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8 +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local void @FuncOneLargeMember( +// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) +// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0 +// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local( +// AMDGCN-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_test_indirect_arg_local() #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_test_indirect_arg_local( +// AMDGCN-SAME: ) #[[ATTR2]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) +// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local void @test_indirect_arg_private( +// AMDGCN-SAME: ) #[[ATTR0]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMember( +// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 +// AMDGCN-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelOneMember(<2 x i32> [[TMP0]]) #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelOneMember( +// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 +// AMDGCN-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 +// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir( +// AMDGCN-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelOneMemberSpir(ptr addrspace(1) noundef align 8 [[TMP0]]) #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelOneMemberSpir( +// AMDGCN-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META14]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8 +// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8 +// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( +// AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 +// AMDGCN-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeOneMember( +// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) +// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local void @FuncTwoMember( +// AMDGCN-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8 +// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8 +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local void @FuncLargeTwoMember( +// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) +// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0 +// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( +// AMDGCN-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 +// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1 +// AMDGCN-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 +// AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelTwoMember( +// AMDGCN-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 +// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP3]], <2 x i32> [[TMP5]]) #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( +// AMDGCN-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 +// AMDGCN-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 +// AMDGCN-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeTwoMember( +// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) +// AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN20-LABEL: define dso_local %struct.Mat4X4 @foo( +// AMDGCN20-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN20-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5) +// AMDGCN20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN20-NEXT: [[IN1:%.*]] = addrspacecast ptr addrspace(5) [[IN]] to ptr +// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr [[IN1]], i32 0, i32 0 +// AMDGCN20-NEXT: store [9 x i32] [[IN_COERCE]], ptr [[COERCE_DIVE]], align 4 +// AMDGCN20-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr [[RETVAL_ASCAST]], align 4 +// AMDGCN20-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]] +// +// +// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @ker( +// AMDGCN20-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN20-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN20-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr +// AMDGCN20-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// AMDGCN20-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8 +// AMDGCN20-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8 +// AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// AMDGCN20-NEXT: call void @__clang_ocl_kern_imp_ker(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR4:[0-9]+]] +// AMDGCN20-NEXT: ret void +// +// +// AMDGCN20-LABEL: define dso_local void @__clang_ocl_kern_imp_ker( +// AMDGCN20-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN20-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN20-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN20-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr +// AMDGCN20-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// AMDGCN20-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8 +// AMDGCN20-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0 +// AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8 +// AMDGCN20-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1 +// AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0 +// AMDGCN20-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4 +// AMDGCN20-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR4]] +// AMDGCN20-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0 +// AMDGCN20-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0 +// AMDGCN20-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4 +// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false) +// AMDGCN20-NEXT: ret void +// +// +// AMDGCN20-LABEL: define dso_local void @foo_large( +// AMDGCN20-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) +// AMDGCN20-NEXT: [[IN:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr +// AMDGCN20-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false) +// AMDGCN20-NEXT: ret void +// +// +// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @ker_large( +// AMDGCN20-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN20-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN20-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr +// AMDGCN20-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// AMDGCN20-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8 +// AMDGCN20-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8 +// AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// AMDGCN20-NEXT: call void @__clang_ocl_kern_imp_ker_large(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR4]] +// AMDGCN20-NEXT: ret void +// +// +// AMDGCN20-LABEL: define dso_local void @__clang_ocl_kern_imp_ker_large( +// AMDGCN20-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN20-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN20-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) +// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) +// AMDGCN20-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr +// AMDGCN20-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// AMDGCN20-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8 +// AMDGCN20-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0 +// AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8 +// AMDGCN20-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1 +// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false) +// AMDGCN20-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]] +// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false) +// AMDGCN20-NEXT: ret void +// +// +// AMDGCN20-LABEL: define dso_local void @FuncOneMember( +// AMDGCN20-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr +// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr +// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 +// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE]], ptr [[COERCE_DIVE]], align 8 +// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 +// AMDGCN20-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 +// AMDGCN20-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 +// AMDGCN20-NEXT: store <2 x i32> [[TMP0]], ptr [[X]], align 8 +// AMDGCN20-NEXT: ret void +// +// +// AMDGCN20-LABEL: define dso_local void @FuncOneLargeMember( +// AMDGCN20-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN20-NEXT: [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr +// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr +// AMDGCN20-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) +// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 +// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 +// AMDGCN20-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0 +// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr [[X]], i64 0, i64 0 +// AMDGCN20-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8 +// AMDGCN20-NEXT: ret void +// +// +// AMDGCN20-LABEL: define dso_local void @test_indirect_arg_globl( +// AMDGCN20-SAME: ) #[[ATTR0]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 @g_s, i64 800, i1 false) +// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] +// AMDGCN20-NEXT: ret void +// +// +// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local( +// AMDGCN20-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: call void @__clang_ocl_kern_imp_test_indirect_arg_local() #[[ATTR4]] +// AMDGCN20-NEXT: ret void +// +// +// AMDGCN20-LABEL: define dso_local void @__clang_ocl_kern_imp_test_indirect_arg_local( +// AMDGCN20-SAME: ) #[[ATTR2]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) +// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] +// AMDGCN20-NEXT: ret void +// +// +// AMDGCN20-LABEL: define dso_local void @test_indirect_arg_private( +// AMDGCN20-SAME: ) #[[ATTR0]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[P_S_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_S]] to ptr +// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[P_S_ASCAST]], i64 800, i1 false) +// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] +// AMDGCN20-NEXT: ret void +// +// +// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelOneMember( +// AMDGCN20-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr +// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 +// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE]], ptr [[COERCE_DIVE]], align 8 +// AMDGCN20-NEXT: [[COERCE_DIVE2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 +// AMDGCN20-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[COERCE_DIVE2]], align 8 +// AMDGCN20-NEXT: call void @__clang_ocl_kern_imp_KernelOneMember(<2 x i32> [[TMP0]]) #[[ATTR4]] +// AMDGCN20-NEXT: ret void +// +// +// AMDGCN20-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelOneMember( +// AMDGCN20-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr +// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 +// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE]], ptr [[COERCE_DIVE]], align 8 +// AMDGCN20-NEXT: [[COERCE_DIVE2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 +// AMDGCN20-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[COERCE_DIVE2]], align 8 +// AMDGCN20-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR4]] +// AMDGCN20-NEXT: ret void +// +// +// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir( +// AMDGCN20-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN20-NEXT: [[U_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[U_ADDR]] to ptr +// AMDGCN20-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR_ASCAST]], align 8 +// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_ASCAST]], align 8 +// AMDGCN20-NEXT: call void @__clang_ocl_kern_imp_KernelOneMemberSpir(ptr addrspace(1) noundef align 8 [[TMP0]]) #[[ATTR4]] +// AMDGCN20-NEXT: ret void +// +// +// AMDGCN20-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelOneMemberSpir( +// AMDGCN20-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META14]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN20-NEXT: [[U_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[U_ADDR]] to ptr +// AMDGCN20-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR_ASCAST]], align 8 +// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_ASCAST]], align 8 +// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0 +// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8 +// AMDGCN20-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR4]] +// AMDGCN20-NEXT: ret void +// +// +// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( +// AMDGCN20-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr +// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 +// AMDGCN20-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 +// AMDGCN20-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8 +// AMDGCN20-NEXT: [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5) +// AMDGCN20-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U1_ASCAST]]) #[[ATTR4]] +// AMDGCN20-NEXT: ret void +// +// +// AMDGCN20-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeOneMember( +// AMDGCN20-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr +// AMDGCN20-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) +// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[U]], i64 800, i1 false) +// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] +// AMDGCN20-NEXT: ret void +// +// +// AMDGCN20-LABEL: define dso_local void @FuncTwoMember( +// AMDGCN20-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr +// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr +// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 +// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE0]], ptr [[TMP0]], align 8 +// AMDGCN20-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 +// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE1]], ptr [[TMP1]], align 8 +// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 +// AMDGCN20-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 +// AMDGCN20-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 +// AMDGCN20-NEXT: store <2 x i32> [[TMP2]], ptr [[Y]], align 8 +// AMDGCN20-NEXT: ret void +// +// +// AMDGCN20-LABEL: define dso_local void @FuncLargeTwoMember( +// AMDGCN20-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN20-NEXT: [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr +// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr +// AMDGCN20-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) +// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 +// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 +// AMDGCN20-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1 +// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr [[Y]], i64 0, i64 0 +// AMDGCN20-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8 +// AMDGCN20-NEXT: ret void +// +// +// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( +// AMDGCN20-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr +// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 +// AMDGCN20-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 +// AMDGCN20-NEXT: store <2 x i32> [[TMP1]], ptr [[TMP0]], align 8 +// AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 +// AMDGCN20-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1 +// AMDGCN20-NEXT: store <2 x i32> [[TMP3]], ptr [[TMP2]], align 8 +// AMDGCN20-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 +// AMDGCN20-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[TMP4]], align 8 +// AMDGCN20-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 +// AMDGCN20-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8 +// AMDGCN20-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR4]] +// AMDGCN20-NEXT: ret void +// +// +// AMDGCN20-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelTwoMember( +// AMDGCN20-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr +// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 +// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE0]], ptr [[TMP0]], align 8 +// AMDGCN20-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 +// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE1]], ptr [[TMP1]], align 8 +// AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 +// AMDGCN20-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[TMP2]], align 8 +// AMDGCN20-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 +// AMDGCN20-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[TMP4]], align 8 +// AMDGCN20-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP3]], <2 x i32> [[TMP5]]) #[[ATTR4]] +// AMDGCN20-NEXT: ret void +// +// +// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( +// AMDGCN20-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr +// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 +// AMDGCN20-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 +// AMDGCN20-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8 +// AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 +// AMDGCN20-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 +// AMDGCN20-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr [[TMP2]], align 8 +// AMDGCN20-NEXT: [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5) +// AMDGCN20-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U1_ASCAST]]) #[[ATTR4]] +// AMDGCN20-NEXT: ret void +// +// +// AMDGCN20-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeTwoMember( +// AMDGCN20-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr +// AMDGCN20-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) +// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[U]], i64 480, i1 false) +// AMDGCN20-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] +// AMDGCN20-NEXT: ret void +// +// +// SPIR-LABEL: define dso_local spir_func void @foo( +// SPIR-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT4X4:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT3X3:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0:[0-9]+]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: ret void +// +// +// SPIR-LABEL: define dso_local spir_kernel void @ker( +// SPIR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6:![0-9]+]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4 +// SPIR-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4 +// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4 +// SPIR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4 +// SPIR-NEXT: call spir_func void @__clang_ocl_kern_imp_ker(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR3:[0-9]+]] +// SPIR-NEXT: ret void +// +// +// SPIR-LABEL: define dso_local spir_func void @__clang_ocl_kern_imp_ker( +// SPIR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META5]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4 +// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4 +// SPIR-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4 +// SPIR-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4 +// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4 +// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i32 0 +// SPIR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4 +// SPIR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[TMP1]], i32 1 +// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 36, i1 false) +// SPIR-NEXT: call spir_func void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4:[0-9]+]] +// SPIR-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 64, i1 false) +// SPIR-NEXT: ret void +// +// +// SPIR-LABEL: define dso_local spir_func void @foo_large( +// SPIR-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT32X32:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: ret void +// +// +// SPIR-LABEL: define dso_local spir_kernel void @ker_large( +// SPIR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META7:![0-9]+]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META6]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4 +// SPIR-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4 +// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4 +// SPIR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4 +// SPIR-NEXT: call spir_func void @__clang_ocl_kern_imp_ker_large(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR3]] +// SPIR-NEXT: ret void +// +// +// SPIR-LABEL: define dso_local spir_func void @__clang_ocl_kern_imp_ker_large( +// SPIR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META7]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META6]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4 +// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4 +// SPIR-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4 +// SPIR-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4 +// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4 +// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i32 0 +// SPIR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4 +// SPIR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i32 1 +// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 4096, i1 false) +// SPIR-NEXT: call spir_func void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]] +// SPIR-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 16384, i1 false) +// SPIR-NEXT: ret void +// +// +// SPIR-LABEL: define dso_local spir_func void @FuncOneMember( +// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 +// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// SPIR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0 +// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[X]], align 8 +// SPIR-NEXT: ret void +// +// +// SPIR-LABEL: define dso_local spir_func void @FuncOneLargeMember( +// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 +// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// SPIR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0 +// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr [[X]], i32 0, i32 0 +// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[ARRAYIDX]], align 8 +// SPIR-NEXT: ret void +// +// +// SPIR-LABEL: define dso_local spir_kernel void @test_indirect_arg_local( +// SPIR-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META8]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: call spir_func void @__clang_ocl_kern_imp_test_indirect_arg_local() #[[ATTR3]] +// SPIR-NEXT: ret void +// +// +// SPIR-LABEL: define dso_local spir_func void @__clang_ocl_kern_imp_test_indirect_arg_local( +// SPIR-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META8]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8 +// SPIR-NEXT: call void @llvm.memcpy.p0.p3.i32(ptr align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i32 800, i1 false) +// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] +// SPIR-NEXT: ret void +// +// +// SPIR-LABEL: define dso_local spir_func void @test_indirect_arg_private( +// SPIR-SAME: ) #[[ATTR0]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8 +// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR4]] +// SPIR-NEXT: ret void +// +// +// SPIR-LABEL: define dso_local spir_kernel void @KernelOneMember( +// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META10:![0-9]+]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META11]] !kernel_arg_type_qual [[META12:![0-9]+]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: call spir_func void @__clang_ocl_kern_imp_KernelOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]] +// SPIR-NEXT: ret void +// +// +// SPIR-LABEL: define dso_local spir_func void @__clang_ocl_kern_imp_KernelOneMember( +// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META11]] !kernel_arg_type_qual [[META12]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: call spir_func void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR4]] +// SPIR-NEXT: ret void +// +// +// SPIR-LABEL: define dso_local spir_kernel void @KernelOneMemberSpir( +// SPIR-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META13:![0-9]+]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META14:![0-9]+]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META12]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 4 +// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4 +// SPIR-NEXT: call spir_func void @__clang_ocl_kern_imp_KernelOneMemberSpir(ptr addrspace(1) noundef align 8 [[TMP0]]) #[[ATTR3]] +// SPIR-NEXT: ret void +// +// +// SPIR-LABEL: define dso_local spir_func void @__clang_ocl_kern_imp_KernelOneMemberSpir( +// SPIR-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META14]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META12]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8 +// SPIR-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 4 +// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4 +// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 [[TMP0]], i32 8, i1 false) +// SPIR-NEXT: call spir_func void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] +// SPIR-NEXT: ret void +// +// +// SPIR-LABEL: define dso_local spir_kernel void @KernelLargeOneMember( +// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META12]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: call spir_func void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]] +// SPIR-NEXT: ret void +// +// +// SPIR-LABEL: define dso_local spir_func void @__clang_ocl_kern_imp_KernelLargeOneMember( +// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META12]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR4]] +// SPIR-NEXT: ret void +// +// +// SPIR-LABEL: define dso_local spir_func void @FuncTwoMember( +// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 +// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// SPIR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1 +// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[Y]], align 8 +// SPIR-NEXT: ret void +// +// +// SPIR-LABEL: define dso_local spir_func void @FuncLargeTwoMember( +// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 +// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// SPIR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1 +// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr [[Y]], i32 0, i32 0 +// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[ARRAYIDX]], align 8 +// SPIR-NEXT: ret void +// +// +// SPIR-LABEL: define dso_local spir_kernel void @KernelTwoMember( +// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META12]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: call spir_func void @__clang_ocl_kern_imp_KernelTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]] +// SPIR-NEXT: ret void +// +// +// SPIR-LABEL: define dso_local spir_func void @__clang_ocl_kern_imp_KernelTwoMember( +// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META12]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: call spir_func void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR4]] +// SPIR-NEXT: ret void +// +// +// SPIR-LABEL: define dso_local spir_kernel void @KernelLargeTwoMember( +// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META12]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: call spir_func void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]] +// SPIR-NEXT: ret void +// +// +// SPIR-LABEL: define dso_local spir_func void @__clang_ocl_kern_imp_KernelLargeTwoMember( +// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META12]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: call spir_func void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR4]] +// SPIR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local %struct.Mat4X4 @foo( +// AMDGCN30-GVAR-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4 +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4 +// AMDGCN30-GVAR-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]] +// +// +// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @ker( +// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: call void @__clang_ocl_kern_imp_ker(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR4:[0-9]+]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @__clang_ocl_kern_imp_ker( +// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1 +// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4 +// AMDGCN30-GVAR-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0 +// AMDGCN30-GVAR-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4 +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false) +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @foo_large( +// AMDGCN30-GVAR-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false) +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @ker_large( +// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: call void @__clang_ocl_kern_imp_ker_large(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @__clang_ocl_kern_imp_ker_large( +// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1 +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false) +// AMDGCN30-GVAR-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false) +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @FuncOneMember( +// AMDGCN30-GVAR-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 +// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-GVAR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8 +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @FuncOneLargeMember( +// AMDGCN30-GVAR-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) +// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-GVAR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @test_indirect_arg_globl( +// AMDGCN30-GVAR-SAME: ) #[[ATTR0]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 @g_s, i64 800, i1 false) +// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local( +// AMDGCN30-GVAR-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: call void @__clang_ocl_kern_imp_test_indirect_arg_local() #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @__clang_ocl_kern_imp_test_indirect_arg_local( +// AMDGCN30-GVAR-SAME: ) #[[ATTR2]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) +// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @test_indirect_arg_private( +// AMDGCN30-GVAR-SAME: ) #[[ATTR0]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelOneMember( +// AMDGCN30-GVAR-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 +// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 +// AMDGCN30-GVAR-NEXT: call void @__clang_ocl_kern_imp_KernelOneMember(<2 x i32> [[TMP0]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelOneMember( +// AMDGCN30-GVAR-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 +// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 +// AMDGCN30-GVAR-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir( +// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: call void @__clang_ocl_kern_imp_KernelOneMemberSpir(ptr addrspace(1) noundef align 8 [[TMP0]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelOneMemberSpir( +// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META14]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8 +// AMDGCN30-GVAR-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( +// AMDGCN30-GVAR-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 +// AMDGCN30-GVAR-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN30-GVAR-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeOneMember( +// AMDGCN30-GVAR-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) +// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @FuncTwoMember( +// AMDGCN30-GVAR-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8 +// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-GVAR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8 +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @FuncLargeTwoMember( +// AMDGCN30-GVAR-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) +// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-GVAR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( +// AMDGCN30-GVAR-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-GVAR-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 +// AMDGCN30-GVAR-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelTwoMember( +// AMDGCN30-GVAR-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-GVAR-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 +// AMDGCN30-GVAR-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP3]], <2 x i32> [[TMP5]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( +// AMDGCN30-GVAR-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 +// AMDGCN30-GVAR-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 +// AMDGCN30-GVAR-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN30-GVAR-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeTwoMember( +// AMDGCN30-GVAR-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) +// AMDGCN30-GVAR-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-LABEL: define dso_local %struct.Mat4X4 @foo( +// AMDGCN30-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN30-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5) +// AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0 +// AMDGCN30-NEXT: store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4 +// AMDGCN30-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4 +// AMDGCN30-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]] +// +// +// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @ker( +// AMDGCN30-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-NEXT: call void @__clang_ocl_kern_imp_ker(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR4:[0-9]+]] +// AMDGCN30-NEXT: ret void +// +// +// AMDGCN30-LABEL: define dso_local void @__clang_ocl_kern_imp_ker( +// AMDGCN30-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN30-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0 +// AMDGCN30-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1 +// AMDGCN30-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0 +// AMDGCN30-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4 +// AMDGCN30-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR4]] +// AMDGCN30-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0 +// AMDGCN30-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0 +// AMDGCN30-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4 +// AMDGCN30-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false) +// AMDGCN30-NEXT: ret void +// +// +// AMDGCN30-LABEL: define dso_local void @foo_large( +// AMDGCN30-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) +// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false) +// AMDGCN30-NEXT: ret void +// +// +// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @ker_large( +// AMDGCN30-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-NEXT: call void @__clang_ocl_kern_imp_ker_large(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR4]] +// AMDGCN30-NEXT: ret void +// +// +// AMDGCN30-LABEL: define dso_local void @__clang_ocl_kern_imp_ker_large( +// AMDGCN30-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) +// AMDGCN30-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) +// AMDGCN30-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0 +// AMDGCN30-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1 +// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false) +// AMDGCN30-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]] +// AMDGCN30-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false) +// AMDGCN30-NEXT: ret void +// +// +// AMDGCN30-LABEL: define dso_local void @FuncOneMember( +// AMDGCN30-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 +// AMDGCN30-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8 +// AMDGCN30-NEXT: ret void +// +// +// AMDGCN30-LABEL: define dso_local void @FuncOneLargeMember( +// AMDGCN30-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN30-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) +// AMDGCN30-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0 +// AMDGCN30-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 +// AMDGCN30-NEXT: ret void +// +// +// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local( +// AMDGCN30-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: call void @__clang_ocl_kern_imp_test_indirect_arg_local() #[[ATTR4]] +// AMDGCN30-NEXT: ret void +// +// +// AMDGCN30-LABEL: define dso_local void @__clang_ocl_kern_imp_test_indirect_arg_local( +// AMDGCN30-SAME: ) #[[ATTR2]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) +// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] +// AMDGCN30-NEXT: ret void +// +// +// AMDGCN30-LABEL: define dso_local void @test_indirect_arg_private( +// AMDGCN30-SAME: ) #[[ATTR0]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR4]] +// AMDGCN30-NEXT: ret void +// +// +// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelOneMember( +// AMDGCN30-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 +// AMDGCN30-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 +// AMDGCN30-NEXT: call void @__clang_ocl_kern_imp_KernelOneMember(<2 x i32> [[TMP0]]) #[[ATTR4]] +// AMDGCN30-NEXT: ret void +// +// +// AMDGCN30-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelOneMember( +// AMDGCN30-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 +// AMDGCN30-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 +// AMDGCN30-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR4]] +// AMDGCN30-NEXT: ret void +// +// +// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir( +// AMDGCN30-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8 +// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8 +// AMDGCN30-NEXT: call void @__clang_ocl_kern_imp_KernelOneMemberSpir(ptr addrspace(1) noundef align 8 [[TMP0]]) #[[ATTR4]] +// AMDGCN30-NEXT: ret void +// +// +// AMDGCN30-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelOneMemberSpir( +// AMDGCN30-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META14]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8 +// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8 +// AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0 +// AMDGCN30-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8 +// AMDGCN30-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR4]] +// AMDGCN30-NEXT: ret void +// +// +// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( +// AMDGCN30-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN30-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 +// AMDGCN30-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN30-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR4]] +// AMDGCN30-NEXT: ret void +// +// +// AMDGCN30-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeOneMember( +// AMDGCN30-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) +// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR4]] +// AMDGCN30-NEXT: ret void // -// AMDGCN30-GVAR-LABEL: define dso_local void @FuncTwoMember( -// AMDGCN30-GVAR-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8 -// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN30-GVAR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8 -// AMDGCN30-GVAR-NEXT: ret void // // AMDGCN30-LABEL: define dso_local void @FuncTwoMember( // AMDGCN30-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] { @@ -1060,74 +1733,6 @@ kernel void KernelLargeOneMember(struct LargeStructOneMember u) { // AMDGCN30-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8 // AMDGCN30-NEXT: ret void // -void FuncTwoMember(struct StructTwoMember u) { - u.y = (int2)(0, 0); -} - -// X86-LABEL: define void @FuncLargeTwoMember( -// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { -// X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 -// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 480, i1 false) -// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 -// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 -// X86-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1 -// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr [[Y]], i32 0, i32 0 -// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8 -// X86-NEXT: ret void -// -// AMDGCN-LABEL: define dso_local void @FuncLargeTwoMember( -// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) -// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0 -// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 -// AMDGCN-NEXT: ret void -// -// AMDGCN20-LABEL: define dso_local void @FuncLargeTwoMember( -// AMDGCN20-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// AMDGCN20-NEXT: [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr -// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr -// AMDGCN20-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) -// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 -// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 -// AMDGCN20-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1 -// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr [[Y]], i64 0, i64 0 -// AMDGCN20-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8 -// AMDGCN20-NEXT: ret void -// -// SPIR-LABEL: define dso_local spir_func void @FuncLargeTwoMember( -// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 -// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 -// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 -// SPIR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1 -// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr [[Y]], i32 0, i32 0 -// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[ARRAYIDX]], align 8 -// SPIR-NEXT: ret void -// -// AMDGCN30-GVAR-LABEL: define dso_local void @FuncLargeTwoMember( -// AMDGCN30-GVAR-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) -// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN30-GVAR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0 -// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 -// AMDGCN30-GVAR-NEXT: ret void // // AMDGCN30-LABEL: define dso_local void @FuncLargeTwoMember( // AMDGCN30-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { @@ -1142,106 +1747,10 @@ void FuncTwoMember(struct StructTwoMember u) { // AMDGCN30-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 // AMDGCN30-NEXT: ret void // -void FuncLargeTwoMember(struct LargeStructTwoMember u) { - u.y[0] = (int2)(0, 0); -} - -// X86-LABEL: define spir_kernel void @KernelTwoMember( -// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { -// X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 -// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 -// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[U1]], ptr align 1 [[U]], i64 16, i1 false) -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 16, i1 false) -// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U_I]]) #[[ATTR4]] -// X86-NEXT: ret void -// -// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( -// AMDGCN-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 -// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1 -// AMDGCN-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 -// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 -// AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 -// AMDGCN-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(5) [[U_I]], align 8 -// AMDGCN-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_I]], i32 0, i32 1 -// AMDGCN-NEXT: store <2 x i32> [[TMP7]], ptr addrspace(5) [[TMP8]], align 8 -// AMDGCN-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr addrspace(5) [[U_I]], align 8 -// AMDGCN-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_I]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP11:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP10]], align 8 -// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP9]], <2 x i32> [[TMP11]]) #[[ATTR4]] -// AMDGCN-NEXT: ret void -// -// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( -// AMDGCN20-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr -// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 -// AMDGCN20-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 -// AMDGCN20-NEXT: store <2 x i32> [[TMP1]], ptr [[TMP0]], align 8 -// AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 -// AMDGCN20-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1 -// AMDGCN20-NEXT: store <2 x i32> [[TMP3]], ptr [[TMP2]], align 8 -// AMDGCN20-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 -// AMDGCN20-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[TMP4]], align 8 -// AMDGCN20-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 -// AMDGCN20-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8 -// AMDGCN20-NEXT: [[U1_I:%.*]] = addrspacecast ptr addrspace(5) [[U_I]] to ptr -// AMDGCN20-NEXT: store <2 x i32> [[TMP5]], ptr [[U1_I]], align 8 -// AMDGCN20-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1_I]], i32 0, i32 1 -// AMDGCN20-NEXT: store <2 x i32> [[TMP7]], ptr [[TMP8]], align 8 -// AMDGCN20-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[U1_I]], align 8 -// AMDGCN20-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1_I]], i32 0, i32 1 -// AMDGCN20-NEXT: [[TMP11:%.*]] = load <2 x i32>, ptr [[TMP10]], align 8 -// AMDGCN20-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP9]], <2 x i32> [[TMP11]]) #[[ATTR4]] -// AMDGCN20-NEXT: ret void -// -// SPIR-LABEL: define dso_local spir_kernel void @KernelTwoMember( -// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META12]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: [[U1:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 -// SPIR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[U1]], ptr align 1 [[U]], i64 16, i1 false) -// SPIR-NEXT: call spir_func void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 8 [[U1]]) #[[ATTR4]] -// SPIR-NEXT: ret void -// -// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( -// AMDGCN30-GVAR-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 -// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1 -// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN30-GVAR-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 -// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(5) [[U_I]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_I]], i32 0, i32 1 -// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP7]], ptr addrspace(5) [[TMP8]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr addrspace(5) [[U_I]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_I]], i32 0, i32 1 -// AMDGCN30-GVAR-NEXT: [[TMP11:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP10]], align 8 -// AMDGCN30-GVAR-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP9]], <2 x i32> [[TMP11]]) #[[ATTR4]] -// AMDGCN30-GVAR-NEXT: ret void // // AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( // AMDGCN30-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { // AMDGCN30-NEXT: [[ENTRY:.*:]] -// AMDGCN30-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN30-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN30-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 @@ -1253,91 +1762,29 @@ void FuncLargeTwoMember(struct LargeStructTwoMember u) { // AMDGCN30-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 // AMDGCN30-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 // AMDGCN30-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 -// AMDGCN30-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(5) [[U_I]], align 8 -// AMDGCN30-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_I]], i32 0, i32 1 -// AMDGCN30-NEXT: store <2 x i32> [[TMP7]], ptr addrspace(5) [[TMP8]], align 8 -// AMDGCN30-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr addrspace(5) [[U_I]], align 8 -// AMDGCN30-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_I]], i32 0, i32 1 -// AMDGCN30-NEXT: [[TMP11:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP10]], align 8 -// AMDGCN30-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP9]], <2 x i32> [[TMP11]]) #[[ATTR4]] +// AMDGCN30-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR4]] // AMDGCN30-NEXT: ret void // -kernel void KernelTwoMember(struct StructTwoMember u) { - FuncTwoMember(u); -} - -// X86-LABEL: define spir_kernel void @KernelLargeTwoMember( -// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { -// X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 -// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 -// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[U1]], ptr align 1 [[U]], i64 480, i1 false) -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 480, i1 false) -// X86-NEXT: call void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U_I]]) #[[ATTR4]] -// X86-NEXT: ret void -// -// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( -// AMDGCN-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 -// AMDGCN-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 -// AMDGCN-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U_I]], ptr addrspace(5) align 8 [[U]], i64 480, i1 false) -// AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U_I]]) #[[ATTR4]] -// AMDGCN-NEXT: ret void -// -// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( -// AMDGCN20-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[COERCE_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr -// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 -// AMDGCN20-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 -// AMDGCN20-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8 -// AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 -// AMDGCN20-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 -// AMDGCN20-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr [[TMP2]], align 8 -// AMDGCN20-NEXT: [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5) -// AMDGCN20-NEXT: [[U_I:%.*]] = addrspacecast ptr addrspace(5) [[COERCE_I]] to ptr -// AMDGCN20-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U_I]], ptr addrspace(5) align 8 [[U1_ASCAST]], i64 480, i1 false) -// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP_I]], ptr align 8 [[U_I]], i64 480, i1 false) -// AMDGCN20-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[BYVAL_TEMP_I]]) #[[ATTR4]] -// AMDGCN20-NEXT: ret void // -// SPIR-LABEL: define dso_local spir_kernel void @KernelLargeTwoMember( -// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META12]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: [[U1:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 -// SPIR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[U1]], ptr align 1 [[U]], i64 480, i1 false) -// SPIR-NEXT: call spir_func void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U1]]) #[[ATTR4]] -// SPIR-NEXT: ret void +// AMDGCN30-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelTwoMember( +// AMDGCN30-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN30-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8 +// AMDGCN30-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN30-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 +// AMDGCN30-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP3]], <2 x i32> [[TMP5]]) #[[ATTR4]] +// AMDGCN30-NEXT: ret void // -// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( -// AMDGCN30-GVAR-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 -// AMDGCN30-GVAR-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 -// AMDGCN30-GVAR-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 -// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U_I]], ptr addrspace(5) align 8 [[U]], i64 480, i1 false) -// AMDGCN30-GVAR-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U_I]]) #[[ATTR4]] -// AMDGCN30-GVAR-NEXT: ret void // // AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( // AMDGCN30-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { // AMDGCN30-NEXT: [[ENTRY:.*:]] -// AMDGCN30-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN30-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN30-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 @@ -1345,14 +1792,18 @@ kernel void KernelTwoMember(struct StructTwoMember u) { // AMDGCN30-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 // AMDGCN30-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 // AMDGCN30-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 -// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U_I]], ptr addrspace(5) align 8 [[U]], i64 480, i1 false) -// AMDGCN30-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U_I]]) #[[ATTR4]] +// AMDGCN30-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR4]] +// AMDGCN30-NEXT: ret void +// +// +// AMDGCN30-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeTwoMember( +// AMDGCN30-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) +// AMDGCN30-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR4]] // AMDGCN30-NEXT: ret void // -kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { - FuncLargeTwoMember(u); -} - //. // X86: [[META4]] = !{i32 1, i32 1} // X86: [[META5]] = !{!"none", !"none"} diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl index 3331559e3e05b..6dc488c40da7f 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 --include-generated-funcs // RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL2.0 -O0 -triple amdgcn | FileCheck -check-prefix=AMDGCN %s typedef int int2 __attribute__((ext_vector_type(2))); @@ -42,6 +42,78 @@ struct LargeStructOneMember g_s; #endif +Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { + Mat4X4 out; + return out; +} + +// Expect two mem copies: one for the argument "in", and one for +// the return value. + +kernel void ker(global Mat3X3 *in, global Mat4X4 *out) { + out[0] = foo(in[1]); +} + +Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { + Mat64X64 out; + return out; +} + +kernel void ker_large(global Mat32X32 *in, global Mat64X64 *out) { + out[0] = foo_large(in[1]); +} + +void FuncOneMember(struct StructOneMember u) { + u.x = (int2)(0, 0); +} + +void FuncOneLargeMember(struct LargeStructOneMember u) { + u.x[0] = (int2)(0, 0); +} + +#if (__OPENCL_C_VERSION__ == 200) || (__OPENCL_C_VERSION__ >= 300 && defined(__opencl_c_program_scope_global_variables)) +void test_indirect_arg_globl(void) { + FuncOneLargeMember(g_s); +} +#endif + +kernel void test_indirect_arg_local(void) { + local struct LargeStructOneMember l_s; + FuncOneLargeMember(l_s); +} + +void test_indirect_arg_private(void) { + struct LargeStructOneMember p_s; + FuncOneLargeMember(p_s); +} + +kernel void KernelOneMember(struct StructOneMember u) { + FuncOneMember(u); +} + +kernel void KernelOneMemberSpir(global struct StructOneMember* u) { + FuncOneMember(*u); +} + +kernel void KernelLargeOneMember(struct LargeStructOneMember u) { + FuncOneLargeMember(u); +} + +void FuncTwoMember(struct StructTwoMember u) { + u.y = (int2)(0, 0); +} + +void FuncLargeTwoMember(struct LargeStructTwoMember u) { + u.y[0] = (int2)(0, 0); +} + +kernel void KernelTwoMember(struct StructTwoMember u) { + FuncTwoMember(u); +} + +kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { + FuncLargeTwoMember(u); +} // AMDGCN-LABEL: define dso_local %struct.Mat4X4 @foo( // AMDGCN-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { // AMDGCN-NEXT: [[ENTRY:.*:]] @@ -54,20 +126,10 @@ struct LargeStructOneMember g_s; // AMDGCN-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr [[RETVAL_ASCAST]], align 4 // AMDGCN-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]] // -Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { - Mat4X4 out; - return out; -} - -// Expect two mem copies: one for the argument "in", and one for -// the return value. - +// // AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker( // AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { // AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) // AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr @@ -76,24 +138,34 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { // AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 // AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8 // AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 -// AMDGCN-NEXT: [[IN_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR_I]] to ptr -// AMDGCN-NEXT: [[OUT_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR_I]] to ptr -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_ASCAST_I]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_ASCAST_I]], align 8 -// AMDGCN-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST_I]], align 8 -// AMDGCN-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST_I]], align 8 -// AMDGCN-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP3]], i64 1 -// AMDGCN-NEXT: [[TMP4:%.*]] = load [9 x i32], ptr addrspace(1) [[ARRAYIDX1_I]], align 4 -// AMDGCN-NEXT: [[CALL_I:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP4]]) #[[ATTR4:[0-9]+]] -// AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL_I]], 0 -// AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP_I]], align 4 -// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(5) align 4 [[TMP_I]], i64 64, i1 false) +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ker(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR4:[0-9]+]] // AMDGCN-NEXT: ret void // -kernel void ker(global Mat3X3 *in, global Mat4X4 *out) { - out[0] = foo(in[1]); -} - +// +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_ker( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr +// AMDGCN-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1 +// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4 +// AMDGCN-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR4]] +// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0 +// AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4 +// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false) +// AMDGCN-NEXT: ret void +// +// // AMDGCN-LABEL: define dso_local void @foo_large( // AMDGCN-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { // AMDGCN-NEXT: [[ENTRY:.*:]] @@ -102,18 +174,10 @@ kernel void ker(global Mat3X3 *in, global Mat4X4 *out) { // AMDGCN-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false) // AMDGCN-NEXT: ret void // -Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { - Mat64X64 out; - return out; -} - +// // AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker_large( // AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { // AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) -// AMDGCN-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) // AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr @@ -122,22 +186,31 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { // AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 // AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8 // AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 -// AMDGCN-NEXT: [[IN_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR_I]] to ptr -// AMDGCN-NEXT: [[OUT_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR_I]] to ptr -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[IN_ADDR_ASCAST_I]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[OUT_ADDR_ASCAST_I]], align 8 -// AMDGCN-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST_I]], align 8 -// AMDGCN-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST_I]], align 8 -// AMDGCN-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP3]], i64 1 -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i64 4096, i1 false) -// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR4]] -// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(5) align 4 [[TMP_I]], i64 16384, i1 false) +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ker_large(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // -kernel void ker_large(global Mat32X32 *in, global Mat64X64 *out) { - out[0] = foo_large(in[1]); -} - +// +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_ker_large( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr +// AMDGCN-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1 +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false) +// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]] +// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false) +// AMDGCN-NEXT: ret void +// +// // AMDGCN-LABEL: define dso_local void @FuncOneMember( // AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] { // AMDGCN-NEXT: [[ENTRY:.*:]] @@ -153,10 +226,7 @@ kernel void ker_large(global Mat32X32 *in, global Mat64X64 *out) { // AMDGCN-NEXT: store <2 x i32> [[TMP0]], ptr [[X]], align 8 // AMDGCN-NEXT: ret void // -void FuncOneMember(struct StructOneMember u) { - u.x = (int2)(0, 0); -} - +// // AMDGCN-LABEL: define dso_local void @FuncOneLargeMember( // AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { // AMDGCN-NEXT: [[ENTRY:.*:]] @@ -172,11 +242,7 @@ void FuncOneMember(struct StructOneMember u) { // AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8 // AMDGCN-NEXT: ret void // -void FuncOneLargeMember(struct LargeStructOneMember u) { - u.x[0] = (int2)(0, 0); -} - -#if (__OPENCL_C_VERSION__ == 200) || (__OPENCL_C_VERSION__ >= 300 && defined(__opencl_c_program_scope_global_variables)) +// // AMDGCN-LABEL: define dso_local void @test_indirect_arg_globl( // AMDGCN-SAME: ) #[[ATTR0]] { // AMDGCN-NEXT: [[ENTRY:.*:]] @@ -185,24 +251,23 @@ void FuncOneLargeMember(struct LargeStructOneMember u) { // AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // -void test_indirect_arg_globl(void) { - FuncOneLargeMember(g_s); -} -#endif - +// // AMDGCN-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local( // AMDGCN-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { // AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP_I]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP_I]]) #[[ATTR4]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_test_indirect_arg_local() #[[ATTR4]] // AMDGCN-NEXT: ret void // -kernel void test_indirect_arg_local(void) { - local struct LargeStructOneMember l_s; - FuncOneLargeMember(l_s); -} - +// +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_test_indirect_arg_local( +// AMDGCN-SAME: ) #[[ATTR2]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) +// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// // AMDGCN-LABEL: define dso_local void @test_indirect_arg_private( // AMDGCN-SAME: ) #[[ATTR0]] { // AMDGCN-NEXT: [[ENTRY:.*:]] @@ -213,71 +278,82 @@ kernel void test_indirect_arg_local(void) { // AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // -void test_indirect_arg_private(void) { - struct LargeStructOneMember p_s; - FuncOneLargeMember(p_s); -} - +// // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMember( // AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { // AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) // AMDGCN-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr // AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 // AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr [[COERCE_DIVE]], align 8 // AMDGCN-NEXT: [[COERCE_DIVE2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 // AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[COERCE_DIVE2]], align 8 -// AMDGCN-NEXT: [[U1_I:%.*]] = addrspacecast ptr addrspace(5) [[U_I]] to ptr -// AMDGCN-NEXT: store <2 x i32> [[TMP0]], ptr [[U1_I]], align 8 -// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[U1_I]], align 8 -// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR4]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelOneMember(<2 x i32> [[TMP0]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // -kernel void KernelOneMember(struct StructOneMember u) { - FuncOneMember(u); -} - +// +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelOneMember( +// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr +// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr [[COERCE_DIVE]], align 8 +// AMDGCN-NEXT: [[COERCE_DIVE2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[COERCE_DIVE2]], align 8 +// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir( // AMDGCN-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { // AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[U_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[U_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[U_ADDR]] to ptr // AMDGCN-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR_ASCAST]], align 8 // AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_ASCAST]], align 8 -// AMDGCN-NEXT: [[U_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[U_ADDR_I]] to ptr -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[U_ADDR_ASCAST_I]], align 8 -// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_ASCAST_I]], align 8 -// AMDGCN-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(1) [[TMP1]], align 8 -// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP2]]) #[[ATTR4]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelOneMemberSpir(ptr addrspace(1) noundef align 8 [[TMP0]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // -kernel void KernelOneMemberSpir(global struct StructOneMember* u) { - FuncOneMember(*u); -} - +// +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelOneMemberSpir( +// AMDGCN-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META14]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[U_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[U_ADDR]] to ptr +// AMDGCN-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8 +// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( // AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { // AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[COERCE_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr // AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 // AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 // AMDGCN-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8 // AMDGCN-NEXT: [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5) -// AMDGCN-NEXT: [[U_I:%.*]] = addrspacecast ptr addrspace(5) [[COERCE_I]] to ptr -// AMDGCN-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U_I]], ptr addrspace(5) align 8 [[U1_ASCAST]], i64 800, i1 false) -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP_I]], ptr align 8 [[U_I]], i64 800, i1 false) -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP_I]]) #[[ATTR4]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U1_ASCAST]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // -kernel void KernelLargeOneMember(struct LargeStructOneMember u) { - FuncOneLargeMember(u); -} - +// +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeOneMember( +// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr +// AMDGCN-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[U]], i64 800, i1 false) +// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// // AMDGCN-LABEL: define dso_local void @FuncTwoMember( // AMDGCN-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] { // AMDGCN-NEXT: [[ENTRY:.*:]] @@ -295,10 +371,7 @@ kernel void KernelLargeOneMember(struct LargeStructOneMember u) { // AMDGCN-NEXT: store <2 x i32> [[TMP2]], ptr [[Y]], align 8 // AMDGCN-NEXT: ret void // -void FuncTwoMember(struct StructTwoMember u) { - u.y = (int2)(0, 0); -} - +// // AMDGCN-LABEL: define dso_local void @FuncLargeTwoMember( // AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { // AMDGCN-NEXT: [[ENTRY:.*:]] @@ -314,14 +387,10 @@ void FuncTwoMember(struct StructTwoMember u) { // AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8 // AMDGCN-NEXT: ret void // -void FuncLargeTwoMember(struct LargeStructTwoMember u) { - u.y[0] = (int2)(0, 0); -} - +// // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( // AMDGCN-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { // AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr // AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 @@ -334,25 +403,30 @@ void FuncLargeTwoMember(struct LargeStructTwoMember u) { // AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[TMP4]], align 8 // AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 // AMDGCN-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8 -// AMDGCN-NEXT: [[U1_I:%.*]] = addrspacecast ptr addrspace(5) [[U_I]] to ptr -// AMDGCN-NEXT: store <2 x i32> [[TMP5]], ptr [[U1_I]], align 8 -// AMDGCN-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1_I]], i32 0, i32 1 -// AMDGCN-NEXT: store <2 x i32> [[TMP7]], ptr [[TMP8]], align 8 -// AMDGCN-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[U1_I]], align 8 -// AMDGCN-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1_I]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP11:%.*]] = load <2 x i32>, ptr [[TMP10]], align 8 -// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP9]], <2 x i32> [[TMP11]]) #[[ATTR4]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // -kernel void KernelTwoMember(struct StructTwoMember u) { - FuncTwoMember(u); -} - +// +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelTwoMember( +// AMDGCN-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr +// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE0]], ptr [[TMP0]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE1]], ptr [[TMP1]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[TMP2]], align 8 +// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[TMP4]], align 8 +// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP3]], <2 x i32> [[TMP5]]) #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( // AMDGCN-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { // AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[COERCE_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr // AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 @@ -362,15 +436,21 @@ kernel void KernelTwoMember(struct StructTwoMember u) { // AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 // AMDGCN-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr [[TMP2]], align 8 // AMDGCN-NEXT: [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5) -// AMDGCN-NEXT: [[U_I:%.*]] = addrspacecast ptr addrspace(5) [[COERCE_I]] to ptr -// AMDGCN-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U_I]], ptr addrspace(5) align 8 [[U1_ASCAST]], i64 480, i1 false) -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP_I]], ptr align 8 [[U_I]], i64 480, i1 false) -// AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[BYVAL_TEMP_I]]) #[[ATTR4]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U1_ASCAST]]) #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeTwoMember( +// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr +// AMDGCN-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[U]], i64 480, i1 false) +// AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // -kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { - FuncLargeTwoMember(u); -} //. // AMDGCN: [[META4]] = !{i32 1, i32 1} // AMDGCN: [[META5]] = !{!"none", !"none"} diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl index 1f62c6cce9a3c..773daf53b2746 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs --prefix-filecheck-ir-name VAR +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes --check-globals all --include-generated-funcs --prefix-filecheck-ir-name VAR --version 5 // RUN: %clang_cc1 -cl-std=CL2.0 -O0 -disable-llvm-passes -fno-ident -emit-llvm -o - -triple amdgcn-amd-amdhsa %s -fdenormal-fp-math-f32=preserve-sign | FileCheck %s --check-prefixes=CHECK,NOCPU // // Check no-optnone and target-cpu behavior @@ -70,9 +70,9 @@ kernel void test_target_features_kernel(global int *i) { // CHECK: @llvm.used = appending addrspace(1) global [10 x ptr] [ptr @__test_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle to ptr), ptr @__test_block_invoke_2_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle to ptr), ptr @__test_block_invoke_3_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr @__test_block_invoke_4_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to ptr), ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr)], section "llvm.metadata" //. // NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone -// NOCPU-LABEL: define {{[^@]+}}@callee -// NOCPU-SAME: (i64 noundef [[ID:%.*]], ptr addrspace(1) noundef [[OUT:%.*]]) #[[ATTR1:[0-9]+]] { -// NOCPU-NEXT: entry: +// NOCPU-LABEL: define dso_local void @callee( +// NOCPU-SAME: i64 noundef [[ID:%.*]], ptr addrspace(1) noundef [[OUT:%.*]]) #[[ATTR1:[0-9]+]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[ID_ADDR:%.*]] = alloca i64, align 8, addrspace(5) // NOCPU-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // NOCPU-NEXT: [[ID_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ID_ADDR]] to ptr @@ -88,9 +88,9 @@ kernel void test_target_features_kernel(global int *i) { // // // NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone -// NOCPU-LABEL: define {{[^@]+}}@test -// NOCPU-SAME: (ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6:![0-9]+]] { -// NOCPU-NEXT: entry: +// NOCPU-LABEL: define dso_local amdgpu_kernel void @test( +// NOCPU-SAME: ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6:![0-9]+]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // NOCPU-NEXT: [[B_ADDR:%.*]] = alloca i8, align 1, addrspace(5) // NOCPU-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) @@ -111,10 +111,10 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU-NEXT: ret void // // -// NOCPU: Function Attrs: alwaysinline convergent norecurse nounwind -// NOCPU-LABEL: define {{[^@]+}}@__clang_ocl_kern_imp_test -// NOCPU-SAME: (ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef signext [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META5]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6]] { -// NOCPU-NEXT: entry: +// NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone +// NOCPU-LABEL: define dso_local void @__clang_ocl_kern_imp_test( +// NOCPU-SAME: ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef signext [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META5]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // NOCPU-NEXT: [[B_ADDR:%.*]] = alloca i8, align 1, addrspace(5) // NOCPU-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) @@ -238,9 +238,9 @@ kernel void test_target_features_kernel(global int *i) { // // // NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone -// NOCPU-LABEL: define {{[^@]+}}@test_target_features_kernel -// NOCPU-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR4:[0-9]+]] !kernel_arg_addr_space [[META7:![0-9]+]] !kernel_arg_access_qual [[META8:![0-9]+]] !kernel_arg_type [[META9:![0-9]+]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META10:![0-9]+]] { -// NOCPU-NEXT: entry: +// NOCPU-LABEL: define dso_local amdgpu_kernel void @test_target_features_kernel( +// NOCPU-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR4:[0-9]+]] !kernel_arg_addr_space [[META7:![0-9]+]] !kernel_arg_access_qual [[META8:![0-9]+]] !kernel_arg_type [[META9:![0-9]+]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META10:![0-9]+]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // NOCPU-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr // NOCPU-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8 @@ -249,10 +249,10 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU-NEXT: ret void // // -// NOCPU: Function Attrs: alwaysinline convergent norecurse nounwind -// NOCPU-LABEL: define {{[^@]+}}@__clang_ocl_kern_imp_test_target_features_kernel -// NOCPU-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR5:[0-9]+]] !kernel_arg_addr_space [[META7]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META10]] { -// NOCPU-NEXT: entry: +// NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone +// NOCPU-LABEL: define dso_local void @__clang_ocl_kern_imp_test_target_features_kernel( +// NOCPU-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR5:[0-9]+]] !kernel_arg_addr_space [[META7]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META10]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // NOCPU-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // NOCPU-NEXT: [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5) @@ -274,9 +274,9 @@ kernel void test_target_features_kernel(global int *i) { // // // NOCPU: Function Attrs: convergent noinline nounwind optnone -// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke -// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR7:[0-9]+]] { -// NOCPU-NEXT: entry: +// NOCPU-LABEL: define internal void @__test_block_invoke( +// NOCPU-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR7:[0-9]+]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // NOCPU-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr @@ -293,9 +293,9 @@ kernel void test_target_features_kernel(global int *i) { // // // NOCPU: Function Attrs: convergent nounwind -// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_kernel -// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR8:[0-9]+]] !associated [[META11:![0-9]+]] !kernel_arg_addr_space [[META12:![0-9]+]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META13]] !kernel_arg_type_qual [[META10]] { -// NOCPU-NEXT: entry: +// NOCPU-LABEL: define internal amdgpu_kernel void @__test_block_invoke_kernel( +// NOCPU-SAME: <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR8:[0-9]+]] !associated [[META11:![0-9]+]] !kernel_arg_addr_space [[META12:![0-9]+]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META13]] !kernel_arg_type_qual [[META10]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), i8 }>, align 8, addrspace(5) // NOCPU-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 // NOCPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr @@ -304,9 +304,9 @@ kernel void test_target_features_kernel(global int *i) { // // // NOCPU: Function Attrs: convergent noinline nounwind optnone -// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_2 -// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR7]] { -// NOCPU-NEXT: entry: +// NOCPU-LABEL: define internal void @__test_block_invoke_2( +// NOCPU-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR7]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // NOCPU-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr @@ -329,9 +329,9 @@ kernel void test_target_features_kernel(global int *i) { // // // NOCPU: Function Attrs: convergent nounwind -// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_2_kernel -// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR8]] !associated [[META14:![0-9]+]] !kernel_arg_addr_space [[META12]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META13]] !kernel_arg_base_type [[META13]] !kernel_arg_type_qual [[META10]] { -// NOCPU-NEXT: entry: +// NOCPU-LABEL: define internal amdgpu_kernel void @__test_block_invoke_2_kernel( +// NOCPU-SAME: <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR8]] !associated [[META14:![0-9]+]] !kernel_arg_addr_space [[META12]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META13]] !kernel_arg_base_type [[META13]] !kernel_arg_type_qual [[META10]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) // NOCPU-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 // NOCPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr @@ -340,9 +340,9 @@ kernel void test_target_features_kernel(global int *i) { // // // NOCPU: Function Attrs: convergent noinline nounwind optnone -// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_3 -// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]], ptr addrspace(3) noundef [[LP:%.*]]) #[[ATTR7]] { -// NOCPU-NEXT: entry: +// NOCPU-LABEL: define internal void @__test_block_invoke_3( +// NOCPU-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]], ptr addrspace(3) noundef [[LP:%.*]]) #[[ATTR7]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // NOCPU-NEXT: [[LP_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) // NOCPU-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -371,9 +371,9 @@ kernel void test_target_features_kernel(global int *i) { // // // NOCPU: Function Attrs: convergent nounwind -// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_3_kernel -// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR8]] !associated [[META15:![0-9]+]] !kernel_arg_addr_space [[META16:![0-9]+]] !kernel_arg_access_qual [[META17:![0-9]+]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META19:![0-9]+]] { -// NOCPU-NEXT: entry: +// NOCPU-LABEL: define internal amdgpu_kernel void @__test_block_invoke_3_kernel( +// NOCPU-SAME: <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR8]] !associated [[META15:![0-9]+]] !kernel_arg_addr_space [[META16:![0-9]+]] !kernel_arg_access_qual [[META17:![0-9]+]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META19:![0-9]+]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[TMP2:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) // NOCPU-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP2]], align 8 // NOCPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr @@ -382,9 +382,9 @@ kernel void test_target_features_kernel(global int *i) { // // // NOCPU: Function Attrs: convergent noinline nounwind optnone -// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_4 -// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR7]] { -// NOCPU-NEXT: entry: +// NOCPU-LABEL: define internal void @__test_block_invoke_4( +// NOCPU-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR7]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // NOCPU-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr @@ -395,14 +395,14 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU-NEXT: [[TMP0:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR]], align 8 // NOCPU-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4 // NOCPU-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8 -// NOCPU-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR11:[0-9]+]] +// NOCPU-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR10]] // NOCPU-NEXT: ret void // // // NOCPU: Function Attrs: convergent nounwind -// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_4_kernel -// NOCPU-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR8]] !associated [[META20:![0-9]+]] !kernel_arg_addr_space [[META12]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META13]] !kernel_arg_base_type [[META13]] !kernel_arg_type_qual [[META10]] { -// NOCPU-NEXT: entry: +// NOCPU-LABEL: define internal amdgpu_kernel void @__test_block_invoke_4_kernel( +// NOCPU-SAME: <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR8]] !associated [[META20:![0-9]+]] !kernel_arg_addr_space [[META12]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META13]] !kernel_arg_base_type [[META13]] !kernel_arg_type_qual [[META10]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5) // NOCPU-NEXT: store <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 // NOCPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr @@ -411,9 +411,9 @@ kernel void test_target_features_kernel(global int *i) { // // // NOCPU: Function Attrs: convergent noinline nounwind optnone -// NOCPU-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke -// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR7]] { -// NOCPU-NEXT: entry: +// NOCPU-LABEL: define internal void @__test_target_features_kernel_block_invoke( +// NOCPU-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR7]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // NOCPU-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr @@ -425,9 +425,9 @@ kernel void test_target_features_kernel(global int *i) { // // // NOCPU: Function Attrs: convergent nounwind -// NOCPU-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke_kernel -// NOCPU-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR8]] !associated [[META21:![0-9]+]] !kernel_arg_addr_space [[META12]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META13]] !kernel_arg_base_type [[META13]] !kernel_arg_type_qual [[META10]] { -// NOCPU-NEXT: entry: +// NOCPU-LABEL: define internal amdgpu_kernel void @__test_target_features_kernel_block_invoke_kernel( +// NOCPU-SAME: { i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR8]] !associated [[META21:![0-9]+]] !kernel_arg_addr_space [[META12]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META13]] !kernel_arg_base_type [[META13]] !kernel_arg_type_qual [[META10]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[TMP1:%.*]] = alloca { i32, i32, ptr }, align 8, addrspace(5) // NOCPU-NEXT: store { i32, i32, ptr } [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 // NOCPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr @@ -451,9 +451,9 @@ kernel void test_target_features_kernel(global int *i) { // // // GFX900: Function Attrs: convergent norecurse nounwind -// GFX900-LABEL: define {{[^@]+}}@callee -// GFX900-SAME: (i64 noundef [[ID:%.*]], ptr addrspace(1) noundef [[OUT:%.*]]) #[[ATTR1:[0-9]+]] { -// GFX900-NEXT: entry: +// GFX900-LABEL: define dso_local void @callee( +// GFX900-SAME: i64 noundef [[ID:%.*]], ptr addrspace(1) noundef [[OUT:%.*]]) #[[ATTR1:[0-9]+]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[ID_ADDR:%.*]] = alloca i64, align 8, addrspace(5) // GFX900-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // GFX900-NEXT: [[ID_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ID_ADDR]] to ptr @@ -469,9 +469,9 @@ kernel void test_target_features_kernel(global int *i) { // // // GFX900: Function Attrs: convergent norecurse nounwind -// GFX900-LABEL: define {{[^@]+}}@test -// GFX900-SAME: (ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { -// GFX900-NEXT: entry: +// GFX900-LABEL: define dso_local amdgpu_kernel void @test( +// GFX900-SAME: ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // GFX900-NEXT: [[B_ADDR:%.*]] = alloca i8, align 1, addrspace(5) // GFX900-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) @@ -492,10 +492,10 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: ret void // // -// GFX900: Function Attrs: alwaysinline convergent norecurse nounwind -// GFX900-LABEL: define {{[^@]+}}@__clang_ocl_kern_imp_test -// GFX900-SAME: (ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef signext [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] { -// GFX900-NEXT: entry: +// GFX900: Function Attrs: convergent norecurse nounwind +// GFX900-LABEL: define dso_local void @__clang_ocl_kern_imp_test( +// GFX900-SAME: ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef signext [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // GFX900-NEXT: [[B_ADDR:%.*]] = alloca i8, align 1, addrspace(5) // GFX900-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) @@ -629,9 +629,9 @@ kernel void test_target_features_kernel(global int *i) { // // // GFX900: Function Attrs: convergent norecurse nounwind -// GFX900-LABEL: define {{[^@]+}}@test_target_features_kernel -// GFX900-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META22:![0-9]+]] !kernel_arg_access_qual [[META23:![0-9]+]] !kernel_arg_type [[META24:![0-9]+]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25:![0-9]+]] { -// GFX900-NEXT: entry: +// GFX900-LABEL: define dso_local amdgpu_kernel void @test_target_features_kernel( +// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META22:![0-9]+]] !kernel_arg_access_qual [[META23:![0-9]+]] !kernel_arg_type [[META24:![0-9]+]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25:![0-9]+]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // GFX900-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr // GFX900-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[TBAA26:![0-9]+]] @@ -640,10 +640,10 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: ret void // // -// GFX900: Function Attrs: alwaysinline convergent norecurse nounwind -// GFX900-LABEL: define {{[^@]+}}@__clang_ocl_kern_imp_test_target_features_kernel -// GFX900-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META22]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25]] { -// GFX900-NEXT: entry: +// GFX900: Function Attrs: convergent norecurse nounwind +// GFX900-LABEL: define dso_local void @__clang_ocl_kern_imp_test_target_features_kernel( +// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META22]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // GFX900-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // GFX900-NEXT: [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5) @@ -671,9 +671,9 @@ kernel void test_target_features_kernel(global int *i) { // // // GFX900: Function Attrs: convergent nounwind -// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke -// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR6:[0-9]+]] { -// GFX900-NEXT: entry: +// GFX900-LABEL: define internal void @__test_block_invoke( +// GFX900-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR6:[0-9]+]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr // GFX900-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8 @@ -687,9 +687,9 @@ kernel void test_target_features_kernel(global int *i) { // // // GFX900: Function Attrs: convergent nounwind -// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_kernel -// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR6]] !associated [[META28:![0-9]+]] !kernel_arg_addr_space [[META29:![0-9]+]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META30:![0-9]+]] !kernel_arg_base_type [[META30]] !kernel_arg_type_qual [[META25]] { -// GFX900-NEXT: entry: +// GFX900-LABEL: define internal amdgpu_kernel void @__test_block_invoke_kernel( +// GFX900-SAME: <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR6]] !associated [[META28:![0-9]+]] !kernel_arg_addr_space [[META29:![0-9]+]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META30:![0-9]+]] !kernel_arg_base_type [[META30]] !kernel_arg_type_qual [[META25]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), i8 }>, align 8, addrspace(5) // GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 // GFX900-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr @@ -698,9 +698,9 @@ kernel void test_target_features_kernel(global int *i) { // // // GFX900: Function Attrs: convergent nounwind -// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_2 -// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR6]] { -// GFX900-NEXT: entry: +// GFX900-LABEL: define internal void @__test_block_invoke_2( +// GFX900-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR6]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr // GFX900-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8 @@ -720,9 +720,9 @@ kernel void test_target_features_kernel(global int *i) { // // // GFX900: Function Attrs: convergent nounwind -// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_2_kernel -// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR6]] !associated [[META31:![0-9]+]] !kernel_arg_addr_space [[META29]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META30]] !kernel_arg_base_type [[META30]] !kernel_arg_type_qual [[META25]] { -// GFX900-NEXT: entry: +// GFX900-LABEL: define internal amdgpu_kernel void @__test_block_invoke_2_kernel( +// GFX900-SAME: <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR6]] !associated [[META31:![0-9]+]] !kernel_arg_addr_space [[META29]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META30]] !kernel_arg_base_type [[META30]] !kernel_arg_type_qual [[META25]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) // GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 // GFX900-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr @@ -731,9 +731,9 @@ kernel void test_target_features_kernel(global int *i) { // // // GFX900: Function Attrs: convergent nounwind -// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_3 -// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]], ptr addrspace(3) noundef [[LP:%.*]]) #[[ATTR6]] { -// GFX900-NEXT: entry: +// GFX900-LABEL: define internal void @__test_block_invoke_3( +// GFX900-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]], ptr addrspace(3) noundef [[LP:%.*]]) #[[ATTR6]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // GFX900-NEXT: [[LP_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr @@ -759,9 +759,9 @@ kernel void test_target_features_kernel(global int *i) { // // // GFX900: Function Attrs: convergent nounwind -// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_3_kernel -// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR6]] !associated [[META33:![0-9]+]] !kernel_arg_addr_space [[META34:![0-9]+]] !kernel_arg_access_qual [[META35:![0-9]+]] !kernel_arg_type [[META36:![0-9]+]] !kernel_arg_base_type [[META36]] !kernel_arg_type_qual [[META37:![0-9]+]] { -// GFX900-NEXT: entry: +// GFX900-LABEL: define internal amdgpu_kernel void @__test_block_invoke_3_kernel( +// GFX900-SAME: <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR6]] !associated [[META33:![0-9]+]] !kernel_arg_addr_space [[META34:![0-9]+]] !kernel_arg_access_qual [[META35:![0-9]+]] !kernel_arg_type [[META36:![0-9]+]] !kernel_arg_base_type [[META36]] !kernel_arg_type_qual [[META37:![0-9]+]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[TMP2:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) // GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP2]], align 8 // GFX900-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr @@ -770,9 +770,9 @@ kernel void test_target_features_kernel(global int *i) { // // // GFX900: Function Attrs: convergent nounwind -// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_4 -// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR6]] { -// GFX900-NEXT: entry: +// GFX900-LABEL: define internal void @__test_block_invoke_4( +// GFX900-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR6]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr // GFX900-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8 @@ -780,14 +780,14 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[TMP0:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA3]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4 // GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[TBAA7]] -// GFX900-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR10:[0-9]+]] +// GFX900-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR8]] // GFX900-NEXT: ret void // // // GFX900: Function Attrs: convergent nounwind -// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_4_kernel -// GFX900-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR6]] !associated [[META38:![0-9]+]] !kernel_arg_addr_space [[META29]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META30]] !kernel_arg_base_type [[META30]] !kernel_arg_type_qual [[META25]] { -// GFX900-NEXT: entry: +// GFX900-LABEL: define internal amdgpu_kernel void @__test_block_invoke_4_kernel( +// GFX900-SAME: <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR6]] !associated [[META38:![0-9]+]] !kernel_arg_addr_space [[META29]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META30]] !kernel_arg_base_type [[META30]] !kernel_arg_type_qual [[META25]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5) // GFX900-NEXT: store <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 // GFX900-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr @@ -796,9 +796,9 @@ kernel void test_target_features_kernel(global int *i) { // // // GFX900: Function Attrs: convergent nounwind -// GFX900-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke -// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR6]] { -// GFX900-NEXT: entry: +// GFX900-LABEL: define internal void @__test_target_features_kernel_block_invoke( +// GFX900-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR6]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr // GFX900-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8 @@ -807,9 +807,9 @@ kernel void test_target_features_kernel(global int *i) { // // // GFX900: Function Attrs: convergent nounwind -// GFX900-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke_kernel -// GFX900-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR6]] !associated [[META39:![0-9]+]] !kernel_arg_addr_space [[META29]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META30]] !kernel_arg_base_type [[META30]] !kernel_arg_type_qual [[META25]] { -// GFX900-NEXT: entry: +// GFX900-LABEL: define internal amdgpu_kernel void @__test_target_features_kernel_block_invoke_kernel( +// GFX900-SAME: { i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR6]] !associated [[META39:![0-9]+]] !kernel_arg_addr_space [[META29]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META30]] !kernel_arg_base_type [[META30]] !kernel_arg_type_qual [[META25]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[TMP1:%.*]] = alloca { i32, i32, ptr }, align 8, addrspace(5) // GFX900-NEXT: store { i32, i32, ptr } [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 // GFX900-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr @@ -820,27 +820,25 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU: attributes #[[ATTR0:[0-9]+]] = { "objc_arc_inert" } // NOCPU: attributes #[[ATTR1]] = { convergent noinline norecurse nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // NOCPU: attributes #[[ATTR2]] = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" } -// NOCPU: attributes #[[ATTR3]] = { alwaysinline convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" } +// NOCPU: attributes #[[ATTR3]] = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // NOCPU: attributes #[[ATTR4]] = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+s-memtime-inst" "uniform-work-group-size"="false" } -// NOCPU: attributes #[[ATTR5]] = { alwaysinline convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+s-memtime-inst" "uniform-work-group-size"="false" } +// NOCPU: attributes #[[ATTR5]] = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+s-memtime-inst" } // NOCPU: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } // NOCPU: attributes #[[ATTR7]] = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // NOCPU: attributes #[[ATTR8]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // NOCPU: attributes #[[ATTR9:[0-9]+]] = { nocallback nofree nosync nounwind willreturn } -// NOCPU: attributes #[[ATTR10]] = { convergent nounwind "uniform-work-group-size"="false" } -// NOCPU: attributes #[[ATTR11]] = { convergent nounwind } +// NOCPU: attributes #[[ATTR10]] = { convergent nounwind } //. // GFX900: attributes #[[ATTR0:[0-9]+]] = { "objc_arc_inert" } // GFX900: attributes #[[ATTR1]] = { convergent norecurse nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" } // GFX900: attributes #[[ATTR2]] = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" "uniform-work-group-size"="false" } -// GFX900: attributes #[[ATTR3]] = { alwaysinline convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" "uniform-work-group-size"="false" } +// GFX900: attributes #[[ATTR3]] = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" } // GFX900: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } // GFX900: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } // GFX900: attributes #[[ATTR6]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" } // GFX900: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind willreturn } -// GFX900: attributes #[[ATTR8]] = { convergent nounwind "uniform-work-group-size"="false" } +// GFX900: attributes #[[ATTR8]] = { convergent nounwind } // GFX900: attributes #[[ATTR9]] = { nounwind } -// GFX900: attributes #[[ATTR10]] = { convergent nounwind } //. // NOCPU: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} // NOCPU: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} diff --git a/clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl b/clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl index 365135a251a24..5f32231b18c3d 100644 --- a/clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl +++ b/clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl @@ -5,6 +5,9 @@ kernel void ker() {}; // CHECK: define{{.*}}@ker() #[[ATTR0:[0-9]+]] +// CHECK: call void @__clang_ocl_kern_imp_ker() #[[ATTR2:[0-9]+]] + +// CHECK: define{{.*}}@__clang_ocl_kern_imp_ker() #[[ATTR1:[0-9]+]] void foo() {}; // CHECK: define{{.*}}@foo() #[[ATTR1:[0-9]+]] @@ -15,3 +18,6 @@ void foo() {}; // CHECK: attributes #[[ATTR1]] // CHECK-NOT: uniform-work-group-size + +// CHECK: attributes #[[ATTR2]] +// CHECK-NOT: uniform-work-group-size diff --git a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl index f5c342d1243ae..ea1f734391614 100644 --- a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl +++ b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes --check-globals all --include-generated-funcs --version 5 // RUN: %clang_cc1 -fno-ident -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" -fdenormal-fp-math-f32=preserve-sign -cl-uniform-work-group-size | FileCheck --check-prefix=SPIR32 %s // RUN: %clang_cc1 -fno-ident -ffp-exception-behavior=strict -fexperimental-strict-floating-point -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck --check-prefix=STRICTFP %s @@ -21,17 +21,9 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { }); } // SPIR32: Function Attrs: convergent noinline norecurse nounwind optnone -// SPIR32-LABEL: define {{[^@]+}}@device_side_enqueue -// SPIR32-SAME: (ptr addrspace(1) align 4 [[A:%.*]], ptr addrspace(1) align 4 [[B:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META2:![0-9]+]] !kernel_arg_access_qual [[META3:![0-9]+]] !kernel_arg_type [[META4:![0-9]+]] !kernel_arg_base_type [[META4]] !kernel_arg_type_qual [[META5:![0-9]+]] { -// SPIR32-NEXT: entry: -// SPIR32-NEXT: [[A_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 -// SPIR32-NEXT: [[B_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 -// SPIR32-NEXT: [[I_ADDR_I:%.*]] = alloca i32, align 4 -// SPIR32-NEXT: [[DEFAULT_QUEUE_I:%.*]] = alloca target("spirv.Queue"), align 4 -// SPIR32-NEXT: [[FLAGS_I:%.*]] = alloca i32, align 4 -// SPIR32-NEXT: [[NDRANGE_I:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4 -// SPIR32-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4 -// SPIR32-NEXT: [[BLOCK_I:%.*]] = alloca <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, align 4 +// SPIR32-LABEL: define dso_local spir_kernel void @device_side_enqueue( +// SPIR32-SAME: ptr addrspace(1) align 4 [[A:%.*]], ptr addrspace(1) align 4 [[B:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META2:![0-9]+]] !kernel_arg_access_qual [[META3:![0-9]+]] !kernel_arg_type [[META4:![0-9]+]] !kernel_arg_base_type [[META4]] !kernel_arg_type_qual [[META5:![0-9]+]] { +// SPIR32-NEXT: [[ENTRY:.*:]] // SPIR32-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // SPIR32-NEXT: [[B_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // SPIR32-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4 @@ -41,36 +33,14 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { // SPIR32-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 4 // SPIR32-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 4 // SPIR32-NEXT: [[TMP2:%.*]] = load i32, ptr [[I_ADDR]], align 4 -// SPIR32-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[A_ADDR_I]], align 4 -// SPIR32-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[B_ADDR_I]], align 4 -// SPIR32-NEXT: store i32 [[TMP2]], ptr [[I_ADDR_I]], align 4 -// SPIR32-NEXT: store i32 0, ptr [[FLAGS_I]], align 4 -// SPIR32-NEXT: [[TMP3:%.*]] = load target("spirv.Queue"), ptr [[DEFAULT_QUEUE_I]], align 4 -// SPIR32-NEXT: [[TMP4:%.*]] = load i32, ptr [[FLAGS_I]], align 4 -// SPIR32-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP_I]], ptr align 4 [[NDRANGE_I]], i32 4, i1 false) -// SPIR32-NEXT: store i32 24, ptr [[BLOCK_I]], align 4 -// SPIR32-NEXT: [[BLOCK_ALIGN_I:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK_I]], i32 0, i32 1 -// SPIR32-NEXT: store i32 4, ptr [[BLOCK_ALIGN_I]], align 4 -// SPIR32-NEXT: [[BLOCK_INVOKE_I:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK_I]], i32 0, i32 2 -// SPIR32-NEXT: store ptr addrspace(4) addrspacecast (ptr @__device_side_enqueue_block_invoke to ptr addrspace(4)), ptr [[BLOCK_INVOKE_I]], align 4 -// SPIR32-NEXT: [[BLOCK_CAPTURED_I:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK_I]], i32 0, i32 3 -// SPIR32-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_I]], align 4 -// SPIR32-NEXT: store ptr addrspace(1) [[TMP5]], ptr [[BLOCK_CAPTURED_I]], align 4 -// SPIR32-NEXT: [[BLOCK_CAPTURED1_I:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK_I]], i32 0, i32 4 -// SPIR32-NEXT: [[TMP6:%.*]] = load i32, ptr [[I_ADDR_I]], align 4 -// SPIR32-NEXT: store i32 [[TMP6]], ptr [[BLOCK_CAPTURED1_I]], align 4 -// SPIR32-NEXT: [[BLOCK_CAPTURED2_I:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK_I]], i32 0, i32 5 -// SPIR32-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[B_ADDR_I]], align 4 -// SPIR32-NEXT: store ptr addrspace(1) [[TMP7]], ptr [[BLOCK_CAPTURED2_I]], align 4 -// SPIR32-NEXT: [[TMP8:%.*]] = addrspacecast ptr [[BLOCK_I]] to ptr addrspace(4) -// SPIR32-NEXT: [[TMP9:%.*]] = call spir_func i32 @__enqueue_kernel_basic(target("spirv.Queue") [[TMP3]], i32 [[TMP4]], ptr [[TMP_I]], ptr addrspace(4) addrspacecast (ptr @__device_side_enqueue_block_invoke_kernel to ptr addrspace(4)), ptr addrspace(4) [[TMP8]]) #[[ATTR6:[0-9]+]] +// SPIR32-NEXT: call spir_func void @__clang_ocl_kern_imp_device_side_enqueue(ptr addrspace(1) align 4 [[TMP0]], ptr addrspace(1) align 4 [[TMP1]], i32 [[TMP2]]) #[[ATTR5:[0-9]+]] // SPIR32-NEXT: ret void // // -// SPIR32: Function Attrs: alwaysinline convergent norecurse nounwind -// SPIR32-LABEL: define {{[^@]+}}@__clang_ocl_kern_imp_device_side_enqueue -// SPIR32-SAME: (ptr addrspace(1) align 4 [[A:%.*]], ptr addrspace(1) align 4 [[B:%.*]], i32 [[I:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META2]] !kernel_arg_access_qual [[META3]] !kernel_arg_type [[META4]] !kernel_arg_base_type [[META4]] !kernel_arg_type_qual [[META5]] { -// SPIR32-NEXT: entry: +// SPIR32: Function Attrs: convergent noinline norecurse nounwind optnone +// SPIR32-LABEL: define dso_local spir_func void @__clang_ocl_kern_imp_device_side_enqueue( +// SPIR32-SAME: ptr addrspace(1) align 4 [[A:%.*]], ptr addrspace(1) align 4 [[B:%.*]], i32 [[I:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META2]] !kernel_arg_access_qual [[META3]] !kernel_arg_type [[META4]] !kernel_arg_base_type [[META4]] !kernel_arg_type_qual [[META5]] { +// SPIR32-NEXT: [[ENTRY:.*:]] // SPIR32-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // SPIR32-NEXT: [[B_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // SPIR32-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4 @@ -107,9 +77,9 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { // // // SPIR32: Function Attrs: convergent noinline nounwind optnone -// SPIR32-LABEL: define {{[^@]+}}@__device_side_enqueue_block_invoke -// SPIR32-SAME: (ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR3:[0-9]+]] { -// SPIR32-NEXT: entry: +// SPIR32-LABEL: define internal spir_func void @__device_side_enqueue_block_invoke( +// SPIR32-SAME: ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR2:[0-9]+]] { +// SPIR32-NEXT: [[ENTRY:.*:]] // SPIR32-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr addrspace(4), align 4 // SPIR32-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr addrspace(4), align 4 // SPIR32-NEXT: store ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR]], align 4 @@ -131,25 +101,17 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { // // // SPIR32: Function Attrs: convergent nounwind -// SPIR32-LABEL: define {{[^@]+}}@__device_side_enqueue_block_invoke_kernel -// SPIR32-SAME: (ptr addrspace(4) [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] { -// SPIR32-NEXT: entry: +// SPIR32-LABEL: define spir_kernel void @__device_side_enqueue_block_invoke_kernel( +// SPIR32-SAME: ptr addrspace(4) [[TMP0:%.*]]) #[[ATTR4:[0-9]+]] { +// SPIR32-NEXT: [[ENTRY:.*:]] // SPIR32-NEXT: call spir_func void @__device_side_enqueue_block_invoke(ptr addrspace(4) [[TMP0]]) // SPIR32-NEXT: ret void // // // STRICTFP: Function Attrs: convergent noinline norecurse nounwind optnone strictfp -// STRICTFP-LABEL: define {{[^@]+}}@device_side_enqueue -// STRICTFP-SAME: (ptr addrspace(1) align 4 [[A:%.*]], ptr addrspace(1) align 4 [[B:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META2:![0-9]+]] !kernel_arg_access_qual [[META3:![0-9]+]] !kernel_arg_type [[META4:![0-9]+]] !kernel_arg_base_type [[META4]] !kernel_arg_type_qual [[META5:![0-9]+]] { -// STRICTFP-NEXT: entry: -// STRICTFP-NEXT: [[A_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 -// STRICTFP-NEXT: [[B_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 -// STRICTFP-NEXT: [[I_ADDR_I:%.*]] = alloca i32, align 4 -// STRICTFP-NEXT: [[DEFAULT_QUEUE_I:%.*]] = alloca target("spirv.Queue"), align 4 -// STRICTFP-NEXT: [[FLAGS_I:%.*]] = alloca i32, align 4 -// STRICTFP-NEXT: [[NDRANGE_I:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4 -// STRICTFP-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4 -// STRICTFP-NEXT: [[BLOCK_I:%.*]] = alloca <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, align 4 +// STRICTFP-LABEL: define dso_local spir_kernel void @device_side_enqueue( +// STRICTFP-SAME: ptr addrspace(1) align 4 [[A:%.*]], ptr addrspace(1) align 4 [[B:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META2:![0-9]+]] !kernel_arg_access_qual [[META3:![0-9]+]] !kernel_arg_type [[META4:![0-9]+]] !kernel_arg_base_type [[META4]] !kernel_arg_type_qual [[META5:![0-9]+]] { +// STRICTFP-NEXT: [[ENTRY:.*:]] // STRICTFP-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // STRICTFP-NEXT: [[B_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // STRICTFP-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4 @@ -159,36 +121,14 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { // STRICTFP-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 4 // STRICTFP-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 4 // STRICTFP-NEXT: [[TMP2:%.*]] = load i32, ptr [[I_ADDR]], align 4 -// STRICTFP-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[A_ADDR_I]], align 4 -// STRICTFP-NEXT: store ptr addrspace(1) [[TMP1]], ptr [[B_ADDR_I]], align 4 -// STRICTFP-NEXT: store i32 [[TMP2]], ptr [[I_ADDR_I]], align 4 -// STRICTFP-NEXT: store i32 0, ptr [[FLAGS_I]], align 4 -// STRICTFP-NEXT: [[TMP3:%.*]] = load target("spirv.Queue"), ptr [[DEFAULT_QUEUE_I]], align 4 -// STRICTFP-NEXT: [[TMP4:%.*]] = load i32, ptr [[FLAGS_I]], align 4 -// STRICTFP-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP_I]], ptr align 4 [[NDRANGE_I]], i32 4, i1 false) #[[ATTR6:[0-9]+]] -// STRICTFP-NEXT: store i32 24, ptr [[BLOCK_I]], align 4 -// STRICTFP-NEXT: [[BLOCK_ALIGN_I:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK_I]], i32 0, i32 1 -// STRICTFP-NEXT: store i32 4, ptr [[BLOCK_ALIGN_I]], align 4 -// STRICTFP-NEXT: [[BLOCK_INVOKE_I:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK_I]], i32 0, i32 2 -// STRICTFP-NEXT: store ptr addrspace(4) addrspacecast (ptr @__device_side_enqueue_block_invoke to ptr addrspace(4)), ptr [[BLOCK_INVOKE_I]], align 4 -// STRICTFP-NEXT: [[BLOCK_CAPTURED_I:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK_I]], i32 0, i32 3 -// STRICTFP-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_I]], align 4 -// STRICTFP-NEXT: store ptr addrspace(1) [[TMP5]], ptr [[BLOCK_CAPTURED_I]], align 4 -// STRICTFP-NEXT: [[BLOCK_CAPTURED1_I:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK_I]], i32 0, i32 4 -// STRICTFP-NEXT: [[TMP6:%.*]] = load i32, ptr [[I_ADDR_I]], align 4 -// STRICTFP-NEXT: store i32 [[TMP6]], ptr [[BLOCK_CAPTURED1_I]], align 4 -// STRICTFP-NEXT: [[BLOCK_CAPTURED2_I:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK_I]], i32 0, i32 5 -// STRICTFP-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[B_ADDR_I]], align 4 -// STRICTFP-NEXT: store ptr addrspace(1) [[TMP7]], ptr [[BLOCK_CAPTURED2_I]], align 4 -// STRICTFP-NEXT: [[TMP8:%.*]] = addrspacecast ptr [[BLOCK_I]] to ptr addrspace(4) -// STRICTFP-NEXT: [[TMP9:%.*]] = call spir_func i32 @__enqueue_kernel_basic(target("spirv.Queue") [[TMP3]], i32 [[TMP4]], ptr [[TMP_I]], ptr addrspace(4) addrspacecast (ptr @__device_side_enqueue_block_invoke_kernel to ptr addrspace(4)), ptr addrspace(4) [[TMP8]]) #[[ATTR7:[0-9]+]] +// STRICTFP-NEXT: call spir_func void @__clang_ocl_kern_imp_device_side_enqueue(ptr addrspace(1) align 4 [[TMP0]], ptr addrspace(1) align 4 [[TMP1]], i32 [[TMP2]]) #[[ATTR5:[0-9]+]] // STRICTFP-NEXT: ret void // // -// STRICTFP: Function Attrs: alwaysinline convergent norecurse nounwind strictfp -// STRICTFP-LABEL: define {{[^@]+}}@__clang_ocl_kern_imp_device_side_enqueue -// STRICTFP-SAME: (ptr addrspace(1) align 4 [[A:%.*]], ptr addrspace(1) align 4 [[B:%.*]], i32 [[I:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META2]] !kernel_arg_access_qual [[META3]] !kernel_arg_type [[META4]] !kernel_arg_base_type [[META4]] !kernel_arg_type_qual [[META5]] { -// STRICTFP-NEXT: entry: +// STRICTFP: Function Attrs: convergent noinline norecurse nounwind optnone strictfp +// STRICTFP-LABEL: define dso_local spir_func void @__clang_ocl_kern_imp_device_side_enqueue( +// STRICTFP-SAME: ptr addrspace(1) align 4 [[A:%.*]], ptr addrspace(1) align 4 [[B:%.*]], i32 [[I:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META2]] !kernel_arg_access_qual [[META3]] !kernel_arg_type [[META4]] !kernel_arg_base_type [[META4]] !kernel_arg_type_qual [[META5]] { +// STRICTFP-NEXT: [[ENTRY:.*:]] // STRICTFP-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // STRICTFP-NEXT: [[B_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // STRICTFP-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4 @@ -203,7 +143,7 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { // STRICTFP-NEXT: store i32 0, ptr [[FLAGS]], align 4 // STRICTFP-NEXT: [[TMP0:%.*]] = load target("spirv.Queue"), ptr [[DEFAULT_QUEUE]], align 4 // STRICTFP-NEXT: [[TMP1:%.*]] = load i32, ptr [[FLAGS]], align 4 -// STRICTFP-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP]], ptr align 4 [[NDRANGE]], i32 4, i1 false) #[[ATTR6]] +// STRICTFP-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP]], ptr align 4 [[NDRANGE]], i32 4, i1 false) #[[ATTR6:[0-9]+]] // STRICTFP-NEXT: [[BLOCK_SIZE:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK]], i32 0, i32 0 // STRICTFP-NEXT: store i32 24, ptr [[BLOCK_SIZE]], align 4 // STRICTFP-NEXT: [[BLOCK_ALIGN:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK]], i32 0, i32 1 @@ -225,9 +165,9 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { // // // STRICTFP: Function Attrs: convergent noinline nounwind optnone strictfp -// STRICTFP-LABEL: define {{[^@]+}}@__device_side_enqueue_block_invoke -// STRICTFP-SAME: (ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR3:[0-9]+]] { -// STRICTFP-NEXT: entry: +// STRICTFP-LABEL: define internal spir_func void @__device_side_enqueue_block_invoke( +// STRICTFP-SAME: ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR2:[0-9]+]] { +// STRICTFP-NEXT: [[ENTRY:.*:]] // STRICTFP-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr addrspace(4), align 4 // STRICTFP-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr addrspace(4), align 4 // STRICTFP-NEXT: store ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR]], align 4 @@ -249,29 +189,27 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { // // // STRICTFP: Function Attrs: convergent nounwind -// STRICTFP-LABEL: define {{[^@]+}}@__device_side_enqueue_block_invoke_kernel -// STRICTFP-SAME: (ptr addrspace(4) [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] { -// STRICTFP-NEXT: entry: +// STRICTFP-LABEL: define spir_kernel void @__device_side_enqueue_block_invoke_kernel( +// STRICTFP-SAME: ptr addrspace(4) [[TMP0:%.*]]) #[[ATTR4:[0-9]+]] { +// STRICTFP-NEXT: [[ENTRY:.*:]] // STRICTFP-NEXT: call spir_func void @__device_side_enqueue_block_invoke(ptr addrspace(4) [[TMP0]]) #[[ATTR6]] // STRICTFP-NEXT: ret void // //. // SPIR32: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } -// SPIR32: attributes #[[ATTR1]] = { alwaysinline convergent norecurse nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } -// SPIR32: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } -// SPIR32: attributes #[[ATTR3]] = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// SPIR32: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -// SPIR32: attributes #[[ATTR5]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// SPIR32: attributes #[[ATTR6]] = { nounwind } +// SPIR32: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +// SPIR32: attributes #[[ATTR2]] = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// SPIR32: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +// SPIR32: attributes #[[ATTR4]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// SPIR32: attributes #[[ATTR5]] = { convergent nounwind "uniform-work-group-size"="true" } //. // STRICTFP: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind optnone strictfp "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" } -// STRICTFP: attributes #[[ATTR1]] = { alwaysinline convergent norecurse nounwind strictfp "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" } -// STRICTFP: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } -// STRICTFP: attributes #[[ATTR3]] = { convergent noinline nounwind optnone strictfp "stack-protector-buffer-size"="8" } -// STRICTFP: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind strictfp willreturn memory(inaccessiblemem: readwrite) } -// STRICTFP: attributes #[[ATTR5]] = { convergent nounwind "stack-protector-buffer-size"="8" } +// STRICTFP: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +// STRICTFP: attributes #[[ATTR2]] = { convergent noinline nounwind optnone strictfp "stack-protector-buffer-size"="8" } +// STRICTFP: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind strictfp willreturn memory(inaccessiblemem: readwrite) } +// STRICTFP: attributes #[[ATTR4]] = { convergent nounwind "stack-protector-buffer-size"="8" } +// STRICTFP: attributes #[[ATTR5]] = { convergent nounwind strictfp "uniform-work-group-size"="false" } // STRICTFP: attributes #[[ATTR6]] = { strictfp } -// STRICTFP: attributes #[[ATTR7]] = { nounwind strictfp } //. // SPIR32: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} // SPIR32: [[META1:![0-9]+]] = !{i32 2, i32 0} diff --git a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl index 4809168b7fc8b..3355fe1c25819 100644 --- a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl +++ b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl @@ -1,12 +1,12 @@ -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B32,SPIR -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B64,SPIR -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=CHECK-LIFETIMES -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B32,SPIR -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B64,SPIR -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=CHECK-LIFETIMES -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefixes=COMMON,B64,X86 -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefixes=COMMON,B64,X86 -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefix=CHECK-LIFETIMES +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B32,SPIR,TRIPLESPIR +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B64,SPIR,TRIPLESPIR +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=CHECK-LIFETIMES,TRIPLESPIR +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B32,SPIR,TRIPLESPIR +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B64,SPIR,TRIPLESPIR +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=CHECK-LIFETIMES,TRIPLESPIR +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefixes=COMMON,B64,X86,TRIPLEX86 +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefixes=COMMON,B64,X86,TRIPLEX86 +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefixes=CHECK-LIFETIMES,TRIPLEX86 #pragma OPENCL EXTENSION cl_khr_subgroups : enable @@ -39,6 +39,12 @@ void callee(int id, __global int *out) { out[id] = id; } +// TRIPLESPIR: define{{.*}} void @device_side_enqueue(ptr addrspace(1) align 4 %{{.*}}, ptr addrspace(1) align 4 %b, i32 %i) +// TRIPLESPIR: call spir_func void @__clang_ocl_kern_imp_device_side_enqueue({{.*}}) + +// TRIPLEX86: define{{.*}} void @device_side_enqueue(ptr addrspace(1) align 4 %{{.*}}, ptr addrspace(1) align 4 %b, i32 %i) +// TRIPLEX86: call void @__clang_ocl_kern_imp_device_side_enqueue({{.*}}) + // COMMON-LABEL: define{{.*}} void @__clang_ocl_kern_imp_device_side_enqueue(ptr addrspace(1) align 4 %{{.*}}, ptr addrspace(1) align 4 %b, i32 %i) kernel void device_side_enqueue(global int *a, global int *b, int i) { // SPIR: %default_queue = alloca target("spirv.Queue") diff --git a/clang/test/CodeGenOpenCL/convergent.cl b/clang/test/CodeGenOpenCL/convergent.cl index 53a35a4f73119..123adba7b40d2 100644 --- a/clang/test/CodeGenOpenCL/convergent.cl +++ b/clang/test/CodeGenOpenCL/convergent.cl @@ -127,7 +127,7 @@ void test_not_unroll() { // CHECK: declare spir_func void @nodupfun(){{[^#]*}} #[[attr3:[0-9]+]] // CHECK-LABEL: @assume_convergent_asm -// CHECK: tail call void asm sideeffect "s_barrier", ""() #6 +// CHECK: tail call void asm sideeffect "s_barrier", ""() #5 kernel void assume_convergent_asm() { __asm__ volatile("s_barrier"); @@ -138,7 +138,6 @@ kernel void assume_convergent_asm() // CHECK: attributes #2 = { {{[^}]*}}convergent{{[^}]*}} } // CHECK: attributes #3 = { {{[^}]*}}convergent noduplicate{{[^}]*}} } // CHECK: attributes #4 = { {{[^}]*}}convergent{{[^}]*}} } -// CHECK: attributes #5 = { {{[^}]*}}alwaysinline convergent{{[^}]*}} } -// CHECK: attributes #6 = { {{[^}]*}}convergent{{[^}]*}} } -// CHECK: attributes #7 = { {{[^}]*}}nounwind{{[^}]*}} } -// CHECK: attributes #8 = { {{[^}]*}}convergent noduplicate nounwind{{[^}]*}} } +// CHECK: attributes #5 = { {{[^}]*}}convergent{{[^}]*}} } +// CHECK: attributes #6 = { {{[^}]*}}nounwind{{[^}]*}} } +// CHECK: attributes #7 = { {{[^}]*}}convergent noduplicate nounwind{{[^}]*}} } diff --git a/clang/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl b/clang/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl index 8e970f121bca8..e741cf63f30b5 100644 --- a/clang/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl +++ b/clang/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl @@ -9,8 +9,15 @@ typedef struct {int a;} ndrange_t; kernel void test(int i) { + // AMDGPU-LABEL: define {{.*}} amdgpu_kernel void @test +// AMDGPU-LABEL: call void @__clang_ocl_kern_imp_test(i32 noundef %0) + // SPIR-LABEL: define {{.*}} spir_kernel void @test +// SPIR-LABEL: call spir_func void @__clang_ocl_kern_imp_test(i32 noundef %0) + +// AMDGPU-LABEL: define {{.*}} void @__clang_ocl_kern_imp_test +// SPIR-LABEL: define {{.*}} spir_func void @__clang_ocl_kern_imp_test // COMMON-LABEL: entry: // AMDGPU: %block_sizes = alloca [1 x i64] @@ -37,5 +44,5 @@ kernel void test(int i) { // CHECK-DEBUG: ![[TESTFILE:[0-9]+]] = !DIFile(filename: "" // CHECK-DEBUG: ![[TESTSCOPE:[0-9]+]] = distinct !DISubprogram(name: "test", linkageName: "__clang_ocl_kern_imp_test", {{.*}} file: ![[TESTFILE]] -// CHECK-DEBUG: ![[IFSCOPE:[0-9]+]] = distinct !DILexicalBlock(scope: ![[TESTSCOPE]], file: ![[TESTFILE]], line: 26) -// CHECK-DEBUG: ![[TEMPLOCATION]] = !DILocation(line: 27, scope: ![[IFSCOPE]]) +// CHECK-DEBUG: ![[IFSCOPE:[0-9]+]] = distinct !DILexicalBlock(scope: ![[TESTSCOPE]], file: ![[TESTFILE]], line: 33) +// CHECK-DEBUG: ![[TEMPLOCATION]] = !DILocation(line: 34, scope: ![[IFSCOPE]]) diff --git a/clang/test/CodeGenOpenCL/implicit-addrspacecast-function-parameter.cl b/clang/test/CodeGenOpenCL/implicit-addrspacecast-function-parameter.cl index ae560898c2be7..86ca9ae509073 100644 --- a/clang/test/CodeGenOpenCL/implicit-addrspacecast-function-parameter.cl +++ b/clang/test/CodeGenOpenCL/implicit-addrspacecast-function-parameter.cl @@ -57,7 +57,7 @@ __kernel void use_of_local_var() // CHECK-LABEL: define dso_local amdgpu_kernel void @use_of_local_var( // CHECK-SAME: ) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META8]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: call void @__clang_ocl_kern_imp_use_of_local_var() #[[ATTR7:[0-9]+]] +// CHECK-NEXT: call void @__clang_ocl_kern_imp_use_of_local_var() #[[ATTR6]] // CHECK-NEXT: ret void // // diff --git a/clang/test/CodeGenOpenCL/opencl-kernel-call.cl b/clang/test/CodeGenOpenCL/opencl-kernel-call.cl index c08c56912644c..cdbe510b723b9 100644 --- a/clang/test/CodeGenOpenCL/opencl-kernel-call.cl +++ b/clang/test/CodeGenOpenCL/opencl-kernel-call.cl @@ -111,7 +111,7 @@ kernel void ext_KernelLargeTwoMember(struct LargeStructTwoMember u); kernel void caller_kern(global int* A, global Mat3X3 *mat3X3, global Mat4X4 *mat4X4, global Mat32X32 *mat32X32, global Mat64X64 *mat64X64){ callee_kern(A); ext_callee_kern(A); - + callee_kern_Mat3X3(mat3X3, mat4X4); callee_kern_Mat32X32(mat32X32, mat64X64); ext_callee_kern_Mat3X3(mat3X3, mat4X4); @@ -211,19 +211,16 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // X86-LABEL: define spir_kernel void @callee_kern( // X86-SAME: ptr noundef align 4 [[A:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { // X86-NEXT: entry: -// X86-NEXT: [[A_ADDR_I:%.*]] = alloca ptr, align 4 // X86-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4 // X86-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4 // X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4 -// X86-NEXT: store ptr [[TMP0]], ptr [[A_ADDR_I]], align 4 -// X86-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR_I]], align 4 -// X86-NEXT: store i32 1, ptr [[TMP1]], align 4 +// X86-NEXT: call void @__clang_ocl_kern_imp_callee_kern(ptr noundef align 4 [[TMP0]]) #[[ATTR4:[0-9]+]] // X86-NEXT: ret void // // -// X86: Function Attrs: alwaysinline convergent norecurse nounwind +// X86: Function Attrs: convergent noinline norecurse nounwind optnone // X86-LABEL: define void @__clang_ocl_kern_imp_callee_kern( -// X86-SAME: ptr noundef align 4 [[A:%.*]]) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] { +// X86-SAME: ptr noundef align 4 [[A:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] { // X86-NEXT: entry: // X86-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4 // X86-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4 @@ -236,28 +233,19 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // X86-LABEL: define spir_kernel void @callee_kern_Mat3X3( // X86-SAME: ptr noundef align 4 [[IN:%.*]], ptr noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META9:![0-9]+]] !kernel_arg_type [[META10:![0-9]+]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11:![0-9]+]] { // X86-NEXT: entry: -// X86-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr, align 4 -// X86-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr, align 4 -// X86-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4 // X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr, align 4 // X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 4 // X86-NEXT: store ptr [[IN]], ptr [[IN_ADDR]], align 4 // X86-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR]], align 4 // X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[IN_ADDR]], align 4 // X86-NEXT: [[TMP1:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4 -// X86-NEXT: store ptr [[TMP0]], ptr [[IN_ADDR_I]], align 4 -// X86-NEXT: store ptr [[TMP1]], ptr [[OUT_ADDR_I]], align 4 -// X86-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_I]], align 4 -// X86-NEXT: [[TMP3:%.*]] = load ptr, ptr [[IN_ADDR_I]], align 4 -// X86-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr [[TMP3]], i32 1 -// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP_I]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[ARRAYIDX1_I]]) #[[ATTR5:[0-9]+]] -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP2]], ptr align 4 [[TMP_I]], i32 64, i1 false) +// X86-NEXT: call void @__clang_ocl_kern_imp_callee_kern_Mat3X3(ptr noundef align 4 [[TMP0]], ptr noundef align 4 [[TMP1]]) #[[ATTR4]] // X86-NEXT: ret void // // -// X86: Function Attrs: alwaysinline convergent norecurse nounwind +// X86: Function Attrs: convergent noinline norecurse nounwind optnone // X86-LABEL: define void @__clang_ocl_kern_imp_callee_kern_Mat3X3( -// X86-SAME: ptr noundef align 4 [[IN:%.*]], ptr noundef align 4 [[OUT:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META10]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11]] { +// X86-SAME: ptr noundef align 4 [[IN:%.*]], ptr noundef align 4 [[OUT:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META10]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11]] { // X86-NEXT: entry: // X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr, align 4 // X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 4 @@ -268,7 +256,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr [[TMP0]], i32 0 // X86-NEXT: [[TMP1:%.*]] = load ptr, ptr [[IN_ADDR]], align 4 // X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr [[TMP1]], i32 1 -// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[ARRAYIDX1]]) #[[ATTR5]] +// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[ARRAYIDX1]]) #[[ATTR4]] // X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 64, i1 false) // X86-NEXT: ret void // @@ -277,28 +265,19 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // X86-LABEL: define spir_kernel void @callee_kern_Mat32X32( // X86-SAME: ptr noundef align 4 [[IN:%.*]], ptr noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META11]] { // X86-NEXT: entry: -// X86-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr, align 4 -// X86-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr, align 4 -// X86-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4 // X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr, align 4 // X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 4 // X86-NEXT: store ptr [[IN]], ptr [[IN_ADDR]], align 4 // X86-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR]], align 4 // X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[IN_ADDR]], align 4 // X86-NEXT: [[TMP1:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4 -// X86-NEXT: store ptr [[TMP0]], ptr [[IN_ADDR_I]], align 4 -// X86-NEXT: store ptr [[TMP1]], ptr [[OUT_ADDR_I]], align 4 -// X86-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_I]], align 4 -// X86-NEXT: [[TMP3:%.*]] = load ptr, ptr [[IN_ADDR_I]], align 4 -// X86-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32:%.*]], ptr [[TMP3]], i32 1 -// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[ARRAYIDX1_I]]) #[[ATTR5]] -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP2]], ptr align 4 [[TMP_I]], i32 16384, i1 false) +// X86-NEXT: call void @__clang_ocl_kern_imp_callee_kern_Mat32X32(ptr noundef align 4 [[TMP0]], ptr noundef align 4 [[TMP1]]) #[[ATTR4]] // X86-NEXT: ret void // // -// X86: Function Attrs: alwaysinline convergent norecurse nounwind +// X86: Function Attrs: convergent noinline norecurse nounwind optnone // X86-LABEL: define void @__clang_ocl_kern_imp_callee_kern_Mat32X32( -// X86-SAME: ptr noundef align 4 [[IN:%.*]], ptr noundef align 4 [[OUT:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META11]] { +// X86-SAME: ptr noundef align 4 [[IN:%.*]], ptr noundef align 4 [[OUT:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META11]] { // X86-NEXT: entry: // X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr, align 4 // X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 4 @@ -309,7 +288,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr [[TMP0]], i32 0 // X86-NEXT: [[TMP1:%.*]] = load ptr, ptr [[IN_ADDR]], align 4 // X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32:%.*]], ptr [[TMP1]], i32 1 -// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[ARRAYIDX1]]) #[[ATTR5]] +// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[ARRAYIDX1]]) #[[ATTR4]] // X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 16384, i1 false) // X86-NEXT: ret void // @@ -318,21 +297,17 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // X86-LABEL: define spir_kernel void @KernelOneMember( // X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13:![0-9]+]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META14:![0-9]+]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META7]] { // X86-NEXT: entry: -// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 -// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 -// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[U1]], ptr align 1 [[U]], i64 8, i1 false) -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 8, i1 false) -// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U_I]]) #[[ATTR5]] +// X86-NEXT: call void @__clang_ocl_kern_imp_KernelOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR4]] // X86-NEXT: ret void // // -// X86: Function Attrs: alwaysinline convergent norecurse nounwind +// X86: Function Attrs: convergent noinline norecurse nounwind optnone // X86-LABEL: define void @__clang_ocl_kern_imp_KernelOneMember( -// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META14]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META7]] { +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META14]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META7]] { // X86-NEXT: entry: // X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 // X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 8, i1 false) -// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR5]] +// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR4]] // X86-NEXT: ret void // // @@ -340,21 +315,17 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // X86-LABEL: define spir_kernel void @KernelLargeOneMember( // X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META7]] { // X86-NEXT: entry: -// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 -// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 -// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[U1]], ptr align 1 [[U]], i64 800, i1 false) -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 800, i1 false) -// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U_I]]) #[[ATTR5]] +// X86-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR4]] // X86-NEXT: ret void // // -// X86: Function Attrs: alwaysinline convergent norecurse nounwind +// X86: Function Attrs: convergent noinline norecurse nounwind optnone // X86-LABEL: define void @__clang_ocl_kern_imp_KernelLargeOneMember( -// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META7]] { +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META7]] { // X86-NEXT: entry: // X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 // X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 800, i1 false) -// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR5]] +// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR4]] // X86-NEXT: ret void // // @@ -362,21 +333,17 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // X86-LABEL: define spir_kernel void @KernelTwoMember( // X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META7]] { // X86-NEXT: entry: -// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 -// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 -// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[U1]], ptr align 1 [[U]], i64 16, i1 false) -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 16, i1 false) -// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U_I]]) #[[ATTR5]] +// X86-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR4]] // X86-NEXT: ret void // // -// X86: Function Attrs: alwaysinline convergent norecurse nounwind +// X86: Function Attrs: convergent noinline norecurse nounwind optnone // X86-LABEL: define void @__clang_ocl_kern_imp_KernelTwoMember( -// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META7]] { +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META7]] { // X86-NEXT: entry: // X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 // X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 16, i1 false) -// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR5]] +// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR4]] // X86-NEXT: ret void // // @@ -384,21 +351,17 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // X86-LABEL: define spir_kernel void @KernelLargeTwoMember( // X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META7]] { // X86-NEXT: entry: -// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 -// X86-NEXT: [[U1:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 -// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[U1]], ptr align 1 [[U]], i64 480, i1 false) -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[U1]], i32 480, i1 false) -// X86-NEXT: call void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U_I]]) #[[ATTR5]] +// X86-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR4]] // X86-NEXT: ret void // // -// X86: Function Attrs: alwaysinline convergent norecurse nounwind +// X86: Function Attrs: convergent noinline norecurse nounwind optnone // X86-LABEL: define void @__clang_ocl_kern_imp_KernelLargeTwoMember( -// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META7]] { +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META7]] { // X86-NEXT: entry: // X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 // X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 480, i1 false) -// X86-NEXT: call void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR5]] +// X86-NEXT: call void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR4]] // X86-NEXT: ret void // // @@ -406,18 +369,6 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // X86-LABEL: define spir_kernel void @caller_kern( // X86-SAME: ptr noundef align 4 [[A:%.*]], ptr noundef align 4 [[MAT3X3:%.*]], ptr noundef align 4 [[MAT4X4:%.*]], ptr noundef align 4 [[MAT32X32:%.*]], ptr noundef align 4 [[MAT64X64:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META18:![0-9]+]] !kernel_arg_access_qual [[META19:![0-9]+]] !kernel_arg_type [[META20:![0-9]+]] !kernel_arg_base_type [[META20]] !kernel_arg_type_qual [[META21:![0-9]+]] { // X86-NEXT: entry: -// X86-NEXT: [[IN_ADDR_I1_I:%.*]] = alloca ptr, align 4 -// X86-NEXT: [[OUT_ADDR_I2_I:%.*]] = alloca ptr, align 4 -// X86-NEXT: [[TMP_I3_I:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4 -// X86-NEXT: [[IN_ADDR_I_I:%.*]] = alloca ptr, align 4 -// X86-NEXT: [[OUT_ADDR_I_I:%.*]] = alloca ptr, align 4 -// X86-NEXT: [[TMP_I_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4 -// X86-NEXT: [[A_ADDR_I_I:%.*]] = alloca ptr, align 4 -// X86-NEXT: [[A_ADDR_I:%.*]] = alloca ptr, align 4 -// X86-NEXT: [[MAT3X3_ADDR_I:%.*]] = alloca ptr, align 4 -// X86-NEXT: [[MAT4X4_ADDR_I:%.*]] = alloca ptr, align 4 -// X86-NEXT: [[MAT32X32_ADDR_I:%.*]] = alloca ptr, align 4 -// X86-NEXT: [[MAT64X64_ADDR_I:%.*]] = alloca ptr, align 4 // X86-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4 // X86-NEXT: [[MAT3X3_ADDR:%.*]] = alloca ptr, align 4 // X86-NEXT: [[MAT4X4_ADDR:%.*]] = alloca ptr, align 4 @@ -433,55 +384,14 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // X86-NEXT: [[TMP2:%.*]] = load ptr, ptr [[MAT4X4_ADDR]], align 4 // X86-NEXT: [[TMP3:%.*]] = load ptr, ptr [[MAT32X32_ADDR]], align 4 // X86-NEXT: [[TMP4:%.*]] = load ptr, ptr [[MAT64X64_ADDR]], align 4 -// X86-NEXT: store ptr [[TMP0]], ptr [[A_ADDR_I]], align 4 -// X86-NEXT: store ptr [[TMP1]], ptr [[MAT3X3_ADDR_I]], align 4 -// X86-NEXT: store ptr [[TMP2]], ptr [[MAT4X4_ADDR_I]], align 4 -// X86-NEXT: store ptr [[TMP3]], ptr [[MAT32X32_ADDR_I]], align 4 -// X86-NEXT: store ptr [[TMP4]], ptr [[MAT64X64_ADDR_I]], align 4 -// X86-NEXT: [[TMP5:%.*]] = load ptr, ptr [[A_ADDR_I]], align 4 -// X86-NEXT: store ptr [[TMP5]], ptr [[A_ADDR_I_I]], align 4 -// X86-NEXT: [[TMP6:%.*]] = load ptr, ptr [[A_ADDR_I_I]], align 4 -// X86-NEXT: store i32 1, ptr [[TMP6]], align 4 -// X86-NEXT: [[TMP7:%.*]] = load ptr, ptr [[A_ADDR_I]], align 4 -// X86-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern(ptr noundef align 4 [[TMP7]]) #[[ATTR6:[0-9]+]] -// X86-NEXT: [[TMP8:%.*]] = load ptr, ptr [[MAT3X3_ADDR_I]], align 4 -// X86-NEXT: [[TMP9:%.*]] = load ptr, ptr [[MAT4X4_ADDR_I]], align 4 -// X86-NEXT: store ptr [[TMP8]], ptr [[IN_ADDR_I_I]], align 4 -// X86-NEXT: store ptr [[TMP9]], ptr [[OUT_ADDR_I_I]], align 4 -// X86-NEXT: [[TMP10:%.*]] = load ptr, ptr [[OUT_ADDR_I_I]], align 4 -// X86-NEXT: [[TMP11:%.*]] = load ptr, ptr [[IN_ADDR_I_I]], align 4 -// X86-NEXT: [[ARRAYIDX1_I_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr [[TMP11]], i32 1 -// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP_I_I]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[ARRAYIDX1_I_I]]) #[[ATTR5]] -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP10]], ptr align 4 [[TMP_I_I]], i32 64, i1 false) -// X86-NEXT: [[TMP12:%.*]] = load ptr, ptr [[MAT32X32_ADDR_I]], align 4 -// X86-NEXT: [[TMP13:%.*]] = load ptr, ptr [[MAT64X64_ADDR_I]], align 4 -// X86-NEXT: store ptr [[TMP12]], ptr [[IN_ADDR_I1_I]], align 4 -// X86-NEXT: store ptr [[TMP13]], ptr [[OUT_ADDR_I2_I]], align 4 -// X86-NEXT: [[TMP14:%.*]] = load ptr, ptr [[OUT_ADDR_I2_I]], align 4 -// X86-NEXT: [[TMP15:%.*]] = load ptr, ptr [[IN_ADDR_I1_I]], align 4 -// X86-NEXT: [[ARRAYIDX1_I4_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32:%.*]], ptr [[TMP15]], i32 1 -// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I3_I]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[ARRAYIDX1_I4_I]]) #[[ATTR5]] -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP14]], ptr align 4 [[TMP_I3_I]], i32 16384, i1 false) -// X86-NEXT: [[TMP16:%.*]] = load ptr, ptr [[MAT3X3_ADDR_I]], align 4 -// X86-NEXT: [[TMP17:%.*]] = load ptr, ptr [[MAT4X4_ADDR_I]], align 4 -// X86-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern_Mat3X3(ptr noundef align 4 [[TMP16]], ptr noundef align 4 [[TMP17]]) #[[ATTR6]] -// X86-NEXT: [[TMP18:%.*]] = load ptr, ptr [[MAT32X32_ADDR_I]], align 4 -// X86-NEXT: [[TMP19:%.*]] = load ptr, ptr [[MAT64X64_ADDR_I]], align 4 -// X86-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern_Mat32X32(ptr noundef align 4 [[TMP18]], ptr noundef align 4 [[TMP19]]) #[[ATTR6]] +// X86-NEXT: call void @__clang_ocl_kern_imp_caller_kern(ptr noundef align 4 [[TMP0]], ptr noundef align 4 [[TMP1]], ptr noundef align 4 [[TMP2]], ptr noundef align 4 [[TMP3]], ptr noundef align 4 [[TMP4]]) #[[ATTR4]] // X86-NEXT: ret void // // -// X86: Function Attrs: alwaysinline convergent norecurse nounwind +// X86: Function Attrs: convergent noinline norecurse nounwind optnone // X86-LABEL: define void @__clang_ocl_kern_imp_caller_kern( -// X86-SAME: ptr noundef align 4 [[A:%.*]], ptr noundef align 4 [[MAT3X3:%.*]], ptr noundef align 4 [[MAT4X4:%.*]], ptr noundef align 4 [[MAT32X32:%.*]], ptr noundef align 4 [[MAT64X64:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META18]] !kernel_arg_access_qual [[META19]] !kernel_arg_type [[META20]] !kernel_arg_base_type [[META20]] !kernel_arg_type_qual [[META21]] { +// X86-SAME: ptr noundef align 4 [[A:%.*]], ptr noundef align 4 [[MAT3X3:%.*]], ptr noundef align 4 [[MAT4X4:%.*]], ptr noundef align 4 [[MAT32X32:%.*]], ptr noundef align 4 [[MAT64X64:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META18]] !kernel_arg_access_qual [[META19]] !kernel_arg_type [[META20]] !kernel_arg_base_type [[META20]] !kernel_arg_type_qual [[META21]] { // X86-NEXT: entry: -// X86-NEXT: [[IN_ADDR_I1:%.*]] = alloca ptr, align 4 -// X86-NEXT: [[OUT_ADDR_I2:%.*]] = alloca ptr, align 4 -// X86-NEXT: [[TMP_I3:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4 -// X86-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr, align 4 -// X86-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr, align 4 -// X86-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4 -// X86-NEXT: [[A_ADDR_I:%.*]] = alloca ptr, align 4 // X86-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4 // X86-NEXT: [[MAT3X3_ADDR:%.*]] = alloca ptr, align 4 // X86-NEXT: [[MAT4X4_ADDR:%.*]] = alloca ptr, align 4 @@ -493,35 +403,21 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // X86-NEXT: store ptr [[MAT32X32]], ptr [[MAT32X32_ADDR]], align 4 // X86-NEXT: store ptr [[MAT64X64]], ptr [[MAT64X64_ADDR]], align 4 // X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4 -// X86-NEXT: store ptr [[TMP0]], ptr [[A_ADDR_I]], align 4 -// X86-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR_I]], align 4 -// X86-NEXT: store i32 1, ptr [[TMP1]], align 4 -// X86-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A_ADDR]], align 4 -// X86-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern(ptr noundef align 4 [[TMP2]]) #[[ATTR6]] -// X86-NEXT: [[TMP3:%.*]] = load ptr, ptr [[MAT3X3_ADDR]], align 4 -// X86-NEXT: [[TMP4:%.*]] = load ptr, ptr [[MAT4X4_ADDR]], align 4 -// X86-NEXT: store ptr [[TMP3]], ptr [[IN_ADDR_I]], align 4 -// X86-NEXT: store ptr [[TMP4]], ptr [[OUT_ADDR_I]], align 4 -// X86-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_I]], align 4 -// X86-NEXT: [[TMP6:%.*]] = load ptr, ptr [[IN_ADDR_I]], align 4 -// X86-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr [[TMP6]], i32 1 -// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP_I]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[ARRAYIDX1_I]]) #[[ATTR5]] -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP5]], ptr align 4 [[TMP_I]], i32 64, i1 false) -// X86-NEXT: [[TMP7:%.*]] = load ptr, ptr [[MAT32X32_ADDR]], align 4 -// X86-NEXT: [[TMP8:%.*]] = load ptr, ptr [[MAT64X64_ADDR]], align 4 -// X86-NEXT: store ptr [[TMP7]], ptr [[IN_ADDR_I1]], align 4 -// X86-NEXT: store ptr [[TMP8]], ptr [[OUT_ADDR_I2]], align 4 -// X86-NEXT: [[TMP9:%.*]] = load ptr, ptr [[OUT_ADDR_I2]], align 4 -// X86-NEXT: [[TMP10:%.*]] = load ptr, ptr [[IN_ADDR_I1]], align 4 -// X86-NEXT: [[ARRAYIDX1_I4:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32:%.*]], ptr [[TMP10]], i32 1 -// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I3]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[ARRAYIDX1_I4]]) #[[ATTR5]] -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP9]], ptr align 4 [[TMP_I3]], i32 16384, i1 false) -// X86-NEXT: [[TMP11:%.*]] = load ptr, ptr [[MAT3X3_ADDR]], align 4 -// X86-NEXT: [[TMP12:%.*]] = load ptr, ptr [[MAT4X4_ADDR]], align 4 -// X86-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern_Mat3X3(ptr noundef align 4 [[TMP11]], ptr noundef align 4 [[TMP12]]) #[[ATTR6]] -// X86-NEXT: [[TMP13:%.*]] = load ptr, ptr [[MAT32X32_ADDR]], align 4 -// X86-NEXT: [[TMP14:%.*]] = load ptr, ptr [[MAT64X64_ADDR]], align 4 -// X86-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern_Mat32X32(ptr noundef align 4 [[TMP13]], ptr noundef align 4 [[TMP14]]) #[[ATTR6]] +// X86-NEXT: call void @__clang_ocl_kern_imp_callee_kern(ptr noundef align 4 [[TMP0]]) #[[ATTR4]] +// X86-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 4 +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern(ptr noundef align 4 [[TMP1]]) #[[ATTR4]] +// X86-NEXT: [[TMP2:%.*]] = load ptr, ptr [[MAT3X3_ADDR]], align 4 +// X86-NEXT: [[TMP3:%.*]] = load ptr, ptr [[MAT4X4_ADDR]], align 4 +// X86-NEXT: call void @__clang_ocl_kern_imp_callee_kern_Mat3X3(ptr noundef align 4 [[TMP2]], ptr noundef align 4 [[TMP3]]) #[[ATTR4]] +// X86-NEXT: [[TMP4:%.*]] = load ptr, ptr [[MAT32X32_ADDR]], align 4 +// X86-NEXT: [[TMP5:%.*]] = load ptr, ptr [[MAT64X64_ADDR]], align 4 +// X86-NEXT: call void @__clang_ocl_kern_imp_callee_kern_Mat32X32(ptr noundef align 4 [[TMP4]], ptr noundef align 4 [[TMP5]]) #[[ATTR4]] +// X86-NEXT: [[TMP6:%.*]] = load ptr, ptr [[MAT3X3_ADDR]], align 4 +// X86-NEXT: [[TMP7:%.*]] = load ptr, ptr [[MAT4X4_ADDR]], align 4 +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern_Mat3X3(ptr noundef align 4 [[TMP6]], ptr noundef align 4 [[TMP7]]) #[[ATTR4]] +// X86-NEXT: [[TMP8:%.*]] = load ptr, ptr [[MAT32X32_ADDR]], align 4 +// X86-NEXT: [[TMP9:%.*]] = load ptr, ptr [[MAT64X64_ADDR]], align 4 +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern_Mat32X32(ptr noundef align 4 [[TMP8]], ptr noundef align 4 [[TMP9]]) #[[ATTR4]] // X86-NEXT: ret void // // @@ -529,56 +425,27 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // X86-LABEL: define spir_kernel void @caller_kern2( // X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[STRUCTONEMEM:%.*]], ptr noundef align 8 [[GLOBAL_STRUCTONEMEM:%.*]], ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[STRUCTTWOMEM:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META22:![0-9]+]] !kernel_arg_access_qual [[META23:![0-9]+]] !kernel_arg_type [[META24:![0-9]+]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25:![0-9]+]] { // X86-NEXT: entry: -// X86-NEXT: [[U_I4_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 -// X86-NEXT: [[STRUCTTWOMEM3_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 -// X86-NEXT: [[U_I_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 -// X86-NEXT: [[STRUCTONEMEM2_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 -// X86-NEXT: [[STRUCTONEMEM_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 -// X86-NEXT: [[STRUCTTWOMEM_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 -// X86-NEXT: [[GLOBAL_STRUCTONEMEM_ADDR_I:%.*]] = alloca ptr, align 4 -// X86-NEXT: [[STRUCTTWOMEM2:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 -// X86-NEXT: [[STRUCTONEMEM1:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 // X86-NEXT: [[GLOBAL_STRUCTONEMEM_ADDR:%.*]] = alloca ptr, align 4 // X86-NEXT: store ptr [[GLOBAL_STRUCTONEMEM]], ptr [[GLOBAL_STRUCTONEMEM_ADDR]], align 4 // X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[GLOBAL_STRUCTONEMEM_ADDR]], align 4 -// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[STRUCTTWOMEM2]], ptr align 1 [[STRUCTTWOMEM]], i64 16, i1 false) -// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[STRUCTONEMEM1]], ptr align 1 [[STRUCTONEMEM]], i64 8, i1 false) -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[STRUCTONEMEM_I]], ptr align 4 [[STRUCTONEMEM1]], i32 8, i1 false) -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[STRUCTTWOMEM_I]], ptr align 4 [[STRUCTTWOMEM2]], i32 16, i1 false) -// X86-NEXT: store ptr [[TMP0]], ptr [[GLOBAL_STRUCTONEMEM_ADDR_I]], align 4 -// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[STRUCTONEMEM2_I]], ptr align 1 [[STRUCTONEMEM_I]], i64 8, i1 false) -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I_I]], ptr align 4 [[STRUCTONEMEM2_I]], i32 8, i1 false) -// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U_I_I]]) #[[ATTR5]] -// X86-NEXT: call void @__clang_ocl_kern_imp_ext_KernelOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[STRUCTONEMEM_I]]) #[[ATTR6]] -// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[STRUCTTWOMEM3_I]], ptr align 1 [[STRUCTTWOMEM_I]], i64 16, i1 false) -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I4_I]], ptr align 4 [[STRUCTTWOMEM3_I]], i32 16, i1 false) -// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U_I4_I]]) #[[ATTR5]] -// X86-NEXT: call void @__clang_ocl_kern_imp_ext_KernelTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[STRUCTTWOMEM_I]]) #[[ATTR6]] +// X86-NEXT: call void @__clang_ocl_kern_imp_caller_kern2(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[STRUCTONEMEM]], ptr noundef align 8 [[TMP0]], ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[STRUCTTWOMEM]]) #[[ATTR4]] // X86-NEXT: ret void // // -// X86: Function Attrs: alwaysinline convergent norecurse nounwind +// X86: Function Attrs: convergent noinline norecurse nounwind optnone // X86-LABEL: define void @__clang_ocl_kern_imp_caller_kern2( -// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]], ptr noundef align 8 [[GLOBAL_STRUCTONEMEM:%.*]], ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 4 [[TMP1:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META22]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25]] { +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]], ptr noundef align 8 [[GLOBAL_STRUCTONEMEM:%.*]], ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 4 [[TMP1:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META22]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25]] { // X86-NEXT: entry: -// X86-NEXT: [[U_I4:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 -// X86-NEXT: [[STRUCTTWOMEM3:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 -// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 -// X86-NEXT: [[STRUCTONEMEM2:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 // X86-NEXT: [[STRUCTONEMEM:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 // X86-NEXT: [[STRUCTTWOMEM:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 // X86-NEXT: [[GLOBAL_STRUCTONEMEM_ADDR:%.*]] = alloca ptr, align 4 // X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[STRUCTONEMEM]], ptr align 4 [[TMP0]], i32 8, i1 false) // X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[STRUCTTWOMEM]], ptr align 4 [[TMP1]], i32 16, i1 false) // X86-NEXT: store ptr [[GLOBAL_STRUCTONEMEM]], ptr [[GLOBAL_STRUCTONEMEM_ADDR]], align 4 -// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[STRUCTONEMEM2]], ptr align 1 [[STRUCTONEMEM]], i64 8, i1 false) -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[STRUCTONEMEM2]], i32 8, i1 false) -// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U_I]]) #[[ATTR5]] -// X86-NEXT: call void @__clang_ocl_kern_imp_ext_KernelOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[STRUCTONEMEM]]) #[[ATTR6]] -// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[STRUCTTWOMEM3]], ptr align 1 [[STRUCTTWOMEM]], i64 16, i1 false) -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I4]], ptr align 4 [[STRUCTTWOMEM3]], i32 16, i1 false) -// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U_I4]]) #[[ATTR5]] -// X86-NEXT: call void @__clang_ocl_kern_imp_ext_KernelTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[STRUCTTWOMEM]]) #[[ATTR6]] +// X86-NEXT: call void @__clang_ocl_kern_imp_KernelOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[STRUCTONEMEM]]) #[[ATTR4]] +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_KernelOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[STRUCTONEMEM]]) #[[ATTR4]] +// X86-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[STRUCTTWOMEM]]) #[[ATTR4]] +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_KernelTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[STRUCTTWOMEM]]) #[[ATTR4]] // X86-NEXT: ret void // // @@ -586,49 +453,22 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // X86-LABEL: define spir_kernel void @caller_kern3( // X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[LARGESTRUCTONEMEM:%.*]], ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[LARGESTRUCTTWOMEM:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META26:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META27:![0-9]+]] !kernel_arg_base_type [[META27]] !kernel_arg_type_qual [[META11]] { // X86-NEXT: entry: -// X86-NEXT: [[U_I4_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 -// X86-NEXT: [[LARGESTRUCTTWOMEM3_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 -// X86-NEXT: [[U_I_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 -// X86-NEXT: [[LARGESTRUCTONEMEM2_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 -// X86-NEXT: [[LARGESTRUCTONEMEM_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 -// X86-NEXT: [[LARGESTRUCTTWOMEM_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 -// X86-NEXT: [[LARGESTRUCTTWOMEM2:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 -// X86-NEXT: [[LARGESTRUCTONEMEM1:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 -// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[LARGESTRUCTTWOMEM2]], ptr align 1 [[LARGESTRUCTTWOMEM]], i64 480, i1 false) -// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[LARGESTRUCTONEMEM1]], ptr align 1 [[LARGESTRUCTONEMEM]], i64 800, i1 false) -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[LARGESTRUCTONEMEM_I]], ptr align 4 [[LARGESTRUCTONEMEM1]], i32 800, i1 false) -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[LARGESTRUCTTWOMEM_I]], ptr align 4 [[LARGESTRUCTTWOMEM2]], i32 480, i1 false) -// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[LARGESTRUCTONEMEM2_I]], ptr align 1 [[LARGESTRUCTONEMEM_I]], i64 800, i1 false) -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I_I]], ptr align 4 [[LARGESTRUCTONEMEM2_I]], i32 800, i1 false) -// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U_I_I]]) #[[ATTR5]] -// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[LARGESTRUCTTWOMEM3_I]], ptr align 1 [[LARGESTRUCTTWOMEM_I]], i64 480, i1 false) -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I4_I]], ptr align 4 [[LARGESTRUCTTWOMEM3_I]], i32 480, i1 false) -// X86-NEXT: call void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U_I4_I]]) #[[ATTR5]] -// X86-NEXT: call void @__clang_ocl_kern_imp_ext_KernelLargeOneMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[LARGESTRUCTONEMEM_I]]) #[[ATTR6]] -// X86-NEXT: call void @__clang_ocl_kern_imp_ext_KernelLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[LARGESTRUCTTWOMEM_I]]) #[[ATTR6]] +// X86-NEXT: call void @__clang_ocl_kern_imp_caller_kern3(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[LARGESTRUCTONEMEM]], ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[LARGESTRUCTTWOMEM]]) #[[ATTR4]] // X86-NEXT: ret void // // -// X86: Function Attrs: alwaysinline convergent norecurse nounwind +// X86: Function Attrs: convergent noinline norecurse nounwind optnone // X86-LABEL: define void @__clang_ocl_kern_imp_caller_kern3( -// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]], ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 4 [[TMP1:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META26]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META27]] !kernel_arg_base_type [[META27]] !kernel_arg_type_qual [[META11]] { +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]], ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 4 [[TMP1:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META26]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META27]] !kernel_arg_base_type [[META27]] !kernel_arg_type_qual [[META11]] { // X86-NEXT: entry: -// X86-NEXT: [[U_I4:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 -// X86-NEXT: [[LARGESTRUCTTWOMEM3:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 -// X86-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 -// X86-NEXT: [[LARGESTRUCTONEMEM2:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 // X86-NEXT: [[LARGESTRUCTONEMEM:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 // X86-NEXT: [[LARGESTRUCTTWOMEM:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 // X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[LARGESTRUCTONEMEM]], ptr align 4 [[TMP0]], i32 800, i1 false) // X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[LARGESTRUCTTWOMEM]], ptr align 4 [[TMP1]], i32 480, i1 false) -// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[LARGESTRUCTONEMEM2]], ptr align 1 [[LARGESTRUCTONEMEM]], i64 800, i1 false) -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I]], ptr align 4 [[LARGESTRUCTONEMEM2]], i32 800, i1 false) -// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U_I]]) #[[ATTR5]] -// X86-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[LARGESTRUCTTWOMEM3]], ptr align 1 [[LARGESTRUCTTWOMEM]], i64 480, i1 false) -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U_I4]], ptr align 4 [[LARGESTRUCTTWOMEM3]], i32 480, i1 false) -// X86-NEXT: call void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U_I4]]) #[[ATTR5]] -// X86-NEXT: call void @__clang_ocl_kern_imp_ext_KernelLargeOneMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[LARGESTRUCTONEMEM]]) #[[ATTR6]] -// X86-NEXT: call void @__clang_ocl_kern_imp_ext_KernelLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[LARGESTRUCTTWOMEM]]) #[[ATTR6]] +// X86-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[LARGESTRUCTONEMEM]]) #[[ATTR4]] +// X86-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[LARGESTRUCTTWOMEM]]) #[[ATTR4]] +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_KernelLargeOneMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[LARGESTRUCTONEMEM]]) #[[ATTR4]] +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_KernelLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[LARGESTRUCTTWOMEM]]) #[[ATTR4]] // X86-NEXT: ret void // // @@ -719,17 +559,14 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // AMDGCN-LABEL: define dso_local amdgpu_kernel void @callee_kern( // AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[A:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { // AMDGCN-NEXT: entry: -// AMDGCN-NEXT: [[A_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8 // AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[A_ADDR_I]], align 8 -// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR_I]], align 8 -// AMDGCN-NEXT: store i32 1, ptr addrspace(1) [[TMP1]], align 4 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_callee_kern(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR5:[0-9]+]] // AMDGCN-NEXT: ret void // // -// AMDGCN: Function Attrs: alwaysinline convergent norecurse nounwind +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone // AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_callee_kern( // AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[A:%.*]]) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] { // AMDGCN-NEXT: entry: @@ -744,29 +581,17 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // AMDGCN-LABEL: define dso_local amdgpu_kernel void @callee_kern_Mat3X3( // AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META9:![0-9]+]] !kernel_arg_type [[META10:![0-9]+]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11:![0-9]+]] { // AMDGCN-NEXT: entry: -// AMDGCN-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) // AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 // AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 // AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 // AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[IN_ADDR_I]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP1]], ptr addrspace(5) [[OUT_ADDR_I]], align 8 -// AMDGCN-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR_I]], align 8 -// AMDGCN-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR_I]], align 8 -// AMDGCN-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP3]], i64 1 -// AMDGCN-NEXT: [[TMP4:%.*]] = load [9 x i32], ptr addrspace(1) [[ARRAYIDX1_I]], align 4 -// AMDGCN-NEXT: [[CALL_I:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP4]]) #[[ATTR5:[0-9]+]] -// AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL_I]], 0 -// AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP_I]], align 4 -// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(5) align 4 [[TMP_I]], i64 64, i1 false) +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_callee_kern_Mat3X3(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR5]] // AMDGCN-NEXT: ret void // // -// AMDGCN: Function Attrs: alwaysinline convergent norecurse nounwind +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone // AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_callee_kern_Mat3X3( // AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META10]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11]] { // AMDGCN-NEXT: entry: @@ -781,7 +606,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1 // AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0 // AMDGCN-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4 -// AMDGCN-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO]]([9 x i32] [[TMP3]]) #[[ATTR5]] +// AMDGCN-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR5]] // AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0 // AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0 // AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4 @@ -793,28 +618,17 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // AMDGCN-LABEL: define dso_local amdgpu_kernel void @callee_kern_Mat32X32( // AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META11]] { // AMDGCN-NEXT: entry: -// AMDGCN-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) -// AMDGCN-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) // AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 // AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 // AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 // AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[IN_ADDR_I]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP1]], ptr addrspace(5) [[OUT_ADDR_I]], align 8 -// AMDGCN-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR_I]], align 8 -// AMDGCN-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR_I]], align 8 -// AMDGCN-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP3]], i64 1 -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I]], i64 4096, i1 false) -// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR5]] -// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(5) align 4 [[TMP_I]], i64 16384, i1 false) +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_callee_kern_Mat32X32(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR5]] // AMDGCN-NEXT: ret void // // -// AMDGCN: Function Attrs: alwaysinline convergent norecurse nounwind +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone // AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_callee_kern_Mat32X32( // AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META11]] { // AMDGCN-NEXT: entry: @@ -838,19 +652,16 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMember( // AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13:![0-9]+]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META14:![0-9]+]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META7]] { // AMDGCN-NEXT: entry: -// AMDGCN-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) // AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 // AMDGCN-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 -// AMDGCN-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[U_I]], align 8 -// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[U_I]], align 8 -// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR5]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelOneMember(<2 x i32> [[TMP0]]) #[[ATTR5]] // AMDGCN-NEXT: ret void // // -// AMDGCN: Function Attrs: alwaysinline convergent norecurse nounwind +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone // AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelOneMember( // AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META14]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META7]] { // AMDGCN-NEXT: entry: @@ -867,17 +678,15 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( // AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META7]] { // AMDGCN-NEXT: entry: -// AMDGCN-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 // AMDGCN-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U_I]], ptr addrspace(5) align 8 [[U]], i64 800, i1 false) -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U_I]]) #[[ATTR5]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR5]] // AMDGCN-NEXT: ret void // // -// AMDGCN: Function Attrs: alwaysinline convergent norecurse nounwind +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone // AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeOneMember( // AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META7]] { // AMDGCN-NEXT: entry: @@ -891,7 +700,6 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( // AMDGCN-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META7]] { // AMDGCN-NEXT: entry: -// AMDGCN-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 @@ -903,17 +711,11 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 // AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 // AMDGCN-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 -// AMDGCN-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(5) [[U_I]], align 8 -// AMDGCN-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_I]], i32 0, i32 1 -// AMDGCN-NEXT: store <2 x i32> [[TMP7]], ptr addrspace(5) [[TMP8]], align 8 -// AMDGCN-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr addrspace(5) [[U_I]], align 8 -// AMDGCN-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_I]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP11:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP10]], align 8 -// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP9]], <2 x i32> [[TMP11]]) #[[ATTR5]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR5]] // AMDGCN-NEXT: ret void // // -// AMDGCN: Function Attrs: alwaysinline convergent norecurse nounwind +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone // AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelTwoMember( // AMDGCN-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META7]] { // AMDGCN-NEXT: entry: @@ -934,7 +736,6 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( // AMDGCN-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META7]] { // AMDGCN-NEXT: entry: -// AMDGCN-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 @@ -942,12 +743,11 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 // AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 // AMDGCN-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U_I]], ptr addrspace(5) align 8 [[U]], i64 480, i1 false) -// AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U_I]]) #[[ATTR5]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR5]] // AMDGCN-NEXT: ret void // // -// AMDGCN: Function Attrs: alwaysinline convergent norecurse nounwind +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone // AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeTwoMember( // AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META7]] { // AMDGCN-NEXT: entry: @@ -961,19 +761,6 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // AMDGCN-LABEL: define dso_local amdgpu_kernel void @caller_kern( // AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[A:%.*]], ptr addrspace(1) noundef align 4 [[MAT3X3:%.*]], ptr addrspace(1) noundef align 4 [[MAT4X4:%.*]], ptr addrspace(1) noundef align 4 [[MAT32X32:%.*]], ptr addrspace(1) noundef align 4 [[MAT64X64:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META18:![0-9]+]] !kernel_arg_access_qual [[META19:![0-9]+]] !kernel_arg_type [[META20:![0-9]+]] !kernel_arg_base_type [[META20]] !kernel_arg_type_qual [[META21:![0-9]+]] { // AMDGCN-NEXT: entry: -// AMDGCN-NEXT: [[IN_ADDR_I1_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[OUT_ADDR_I2_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP_I3_I:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) -// AMDGCN-NEXT: [[BYVAL_TEMP_I_I:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) -// AMDGCN-NEXT: [[IN_ADDR_I_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[OUT_ADDR_I_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP_I_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) -// AMDGCN-NEXT: [[A_ADDR_I_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[A_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[MAT3X3_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[MAT4X4_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[MAT32X32_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[MAT64X64_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[MAT3X3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[MAT4X4_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) @@ -989,60 +776,14 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // AMDGCN-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT4X4_ADDR]], align 8 // AMDGCN-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT32X32_ADDR]], align 8 // AMDGCN-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT64X64_ADDR]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[A_ADDR_I]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP1]], ptr addrspace(5) [[MAT3X3_ADDR_I]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP2]], ptr addrspace(5) [[MAT4X4_ADDR_I]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP3]], ptr addrspace(5) [[MAT32X32_ADDR_I]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP4]], ptr addrspace(5) [[MAT64X64_ADDR_I]], align 8 -// AMDGCN-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR_I]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP5]], ptr addrspace(5) [[A_ADDR_I_I]], align 8 -// AMDGCN-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR_I_I]], align 8 -// AMDGCN-NEXT: store i32 1, ptr addrspace(1) [[TMP6]], align 4 -// AMDGCN-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR_I]], align 8 -// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern(ptr addrspace(1) noundef align 4 [[TMP7]]) #[[ATTR6:[0-9]+]] -// AMDGCN-NEXT: [[TMP8:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT3X3_ADDR_I]], align 8 -// AMDGCN-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT4X4_ADDR_I]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP8]], ptr addrspace(5) [[IN_ADDR_I_I]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP9]], ptr addrspace(5) [[OUT_ADDR_I_I]], align 8 -// AMDGCN-NEXT: [[TMP10:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR_I_I]], align 8 -// AMDGCN-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR_I_I]], align 8 -// AMDGCN-NEXT: [[ARRAYIDX1_I_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP11]], i64 1 -// AMDGCN-NEXT: [[TMP12:%.*]] = load [9 x i32], ptr addrspace(1) [[ARRAYIDX1_I_I]], align 4 -// AMDGCN-NEXT: [[CALL_I_I:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO]]([9 x i32] [[TMP12]]) #[[ATTR5]] -// AMDGCN-NEXT: [[TMP13:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL_I_I]], 0 -// AMDGCN-NEXT: store [16 x i32] [[TMP13]], ptr addrspace(5) [[TMP_I_I]], align 4 -// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP10]], ptr addrspace(5) align 4 [[TMP_I_I]], i64 64, i1 false) -// AMDGCN-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT32X32_ADDR_I]], align 8 -// AMDGCN-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT64X64_ADDR_I]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP14]], ptr addrspace(5) [[IN_ADDR_I1_I]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP15]], ptr addrspace(5) [[OUT_ADDR_I2_I]], align 8 -// AMDGCN-NEXT: [[TMP16:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR_I2_I]], align 8 -// AMDGCN-NEXT: [[TMP17:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR_I1_I]], align 8 -// AMDGCN-NEXT: [[ARRAYIDX1_I4_I:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP17]], i64 1 -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP_I_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I4_I]], i64 4096, i1 false) -// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I3_I]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I_I]]) #[[ATTR5]] -// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP16]], ptr addrspace(5) align 4 [[TMP_I3_I]], i64 16384, i1 false) -// AMDGCN-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT3X3_ADDR_I]], align 8 -// AMDGCN-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT4X4_ADDR_I]], align 8 -// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern_Mat3X3(ptr addrspace(1) noundef align 4 [[TMP18]], ptr addrspace(1) noundef align 4 [[TMP19]]) #[[ATTR6]] -// AMDGCN-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT32X32_ADDR_I]], align 8 -// AMDGCN-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT64X64_ADDR_I]], align 8 -// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern_Mat32X32(ptr addrspace(1) noundef align 4 [[TMP20]], ptr addrspace(1) noundef align 4 [[TMP21]]) #[[ATTR6]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_caller_kern(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]], ptr addrspace(1) noundef align 4 [[TMP2]], ptr addrspace(1) noundef align 4 [[TMP3]], ptr addrspace(1) noundef align 4 [[TMP4]]) #[[ATTR5]] // AMDGCN-NEXT: ret void // // -// AMDGCN: Function Attrs: alwaysinline convergent norecurse nounwind +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone // AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_caller_kern( // AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[A:%.*]], ptr addrspace(1) noundef align 4 [[MAT3X3:%.*]], ptr addrspace(1) noundef align 4 [[MAT4X4:%.*]], ptr addrspace(1) noundef align 4 [[MAT32X32:%.*]], ptr addrspace(1) noundef align 4 [[MAT64X64:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META18]] !kernel_arg_access_qual [[META19]] !kernel_arg_type [[META20]] !kernel_arg_base_type [[META20]] !kernel_arg_type_qual [[META21]] { // AMDGCN-NEXT: entry: -// AMDGCN-NEXT: [[IN_ADDR_I1:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[OUT_ADDR_I2:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP_I3:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) -// AMDGCN-NEXT: [[BYVAL_TEMP_I:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) -// AMDGCN-NEXT: [[IN_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[OUT_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP_I:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) -// AMDGCN-NEXT: [[A_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[MAT3X3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[MAT4X4_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) @@ -1054,39 +795,21 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // AMDGCN-NEXT: store ptr addrspace(1) [[MAT32X32]], ptr addrspace(5) [[MAT32X32_ADDR]], align 8 // AMDGCN-NEXT: store ptr addrspace(1) [[MAT64X64]], ptr addrspace(5) [[MAT64X64_ADDR]], align 8 // AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[A_ADDR_I]], align 8 -// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR_I]], align 8 -// AMDGCN-NEXT: store i32 1, ptr addrspace(1) [[TMP1]], align 4 -// AMDGCN-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 -// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern(ptr addrspace(1) noundef align 4 [[TMP2]]) #[[ATTR6]] -// AMDGCN-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT3X3_ADDR]], align 8 -// AMDGCN-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT4X4_ADDR]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP3]], ptr addrspace(5) [[IN_ADDR_I]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP4]], ptr addrspace(5) [[OUT_ADDR_I]], align 8 -// AMDGCN-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR_I]], align 8 -// AMDGCN-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR_I]], align 8 -// AMDGCN-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP6]], i64 1 -// AMDGCN-NEXT: [[TMP7:%.*]] = load [9 x i32], ptr addrspace(1) [[ARRAYIDX1_I]], align 4 -// AMDGCN-NEXT: [[CALL_I:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO]]([9 x i32] [[TMP7]]) #[[ATTR5]] -// AMDGCN-NEXT: [[TMP8:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL_I]], 0 -// AMDGCN-NEXT: store [16 x i32] [[TMP8]], ptr addrspace(5) [[TMP_I]], align 4 -// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP5]], ptr addrspace(5) align 4 [[TMP_I]], i64 64, i1 false) -// AMDGCN-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT32X32_ADDR]], align 8 -// AMDGCN-NEXT: [[TMP10:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT64X64_ADDR]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP9]], ptr addrspace(5) [[IN_ADDR_I1]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP10]], ptr addrspace(5) [[OUT_ADDR_I2]], align 8 -// AMDGCN-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR_I2]], align 8 -// AMDGCN-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR_I1]], align 8 -// AMDGCN-NEXT: [[ARRAYIDX1_I4:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP12]], i64 1 -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP_I]], ptr addrspace(1) align 4 [[ARRAYIDX1_I4]], i64 4096, i1 false) -// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_I3]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP_I]]) #[[ATTR5]] -// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[TMP11]], ptr addrspace(5) align 4 [[TMP_I3]], i64 16384, i1 false) -// AMDGCN-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT3X3_ADDR]], align 8 -// AMDGCN-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT4X4_ADDR]], align 8 -// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern_Mat3X3(ptr addrspace(1) noundef align 4 [[TMP13]], ptr addrspace(1) noundef align 4 [[TMP14]]) #[[ATTR6]] -// AMDGCN-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT32X32_ADDR]], align 8 -// AMDGCN-NEXT: [[TMP16:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT64X64_ADDR]], align 8 -// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern_Mat32X32(ptr addrspace(1) noundef align 4 [[TMP15]], ptr addrspace(1) noundef align 4 [[TMP16]]) #[[ATTR6]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_callee_kern(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR5]] +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern(ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR5]] +// AMDGCN-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT3X3_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT4X4_ADDR]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_callee_kern_Mat3X3(ptr addrspace(1) noundef align 4 [[TMP2]], ptr addrspace(1) noundef align 4 [[TMP3]]) #[[ATTR5]] +// AMDGCN-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT32X32_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT64X64_ADDR]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_callee_kern_Mat32X32(ptr addrspace(1) noundef align 4 [[TMP4]], ptr addrspace(1) noundef align 4 [[TMP5]]) #[[ATTR5]] +// AMDGCN-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT3X3_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT4X4_ADDR]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern_Mat3X3(ptr addrspace(1) noundef align 4 [[TMP6]], ptr addrspace(1) noundef align 4 [[TMP7]]) #[[ATTR5]] +// AMDGCN-NEXT: [[TMP8:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT32X32_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT64X64_ADDR]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern_Mat32X32(ptr addrspace(1) noundef align 4 [[TMP8]], ptr addrspace(1) noundef align 4 [[TMP9]]) #[[ATTR5]] // AMDGCN-NEXT: ret void // // @@ -1094,12 +817,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // AMDGCN-LABEL: define dso_local amdgpu_kernel void @caller_kern2( // AMDGCN-SAME: <2 x i32> [[STRUCTONEMEM_COERCE:%.*]], ptr addrspace(1) noundef align 8 [[GLOBAL_STRUCTONEMEM:%.*]], [[STRUCT_STRUCTTWOMEMBER:%.*]] [[STRUCTTWOMEM_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META22:![0-9]+]] !kernel_arg_access_qual [[META23:![0-9]+]] !kernel_arg_type [[META24:![0-9]+]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25:![0-9]+]] { // AMDGCN-NEXT: entry: -// AMDGCN-NEXT: [[U_I3_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[U_I_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN-NEXT: [[STRUCTONEMEM_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[STRUCTTWOMEM_I:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[GLOBAL_STRUCTONEMEM_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[STRUCTONEMEM:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[STRUCTONEMEM:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) // AMDGCN-NEXT: [[STRUCTTWOMEM:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[GLOBAL_STRUCTONEMEM_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[STRUCTONEMEM]], i32 0, i32 0 @@ -1118,42 +836,16 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // AMDGCN-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 // AMDGCN-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1 // AMDGCN-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP8]], align 8 -// AMDGCN-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(5) [[STRUCTONEMEM_I]], align 8 -// AMDGCN-NEXT: store <2 x i32> [[TMP7]], ptr addrspace(5) [[STRUCTTWOMEM_I]], align 8 -// AMDGCN-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM_I]], i32 0, i32 1 -// AMDGCN-NEXT: store <2 x i32> [[TMP9]], ptr addrspace(5) [[TMP10]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[TMP4]], ptr addrspace(5) [[GLOBAL_STRUCTONEMEM_ADDR_I]], align 8 -// AMDGCN-NEXT: [[TMP11:%.*]] = load <2 x i32>, ptr addrspace(5) [[STRUCTONEMEM_I]], align 8 -// AMDGCN-NEXT: store <2 x i32> [[TMP11]], ptr addrspace(5) [[U_I_I]], align 8 -// AMDGCN-NEXT: [[TMP12:%.*]] = load <2 x i32>, ptr addrspace(5) [[U_I_I]], align 8 -// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP12]]) #[[ATTR5]] -// AMDGCN-NEXT: [[TMP13:%.*]] = load <2 x i32>, ptr addrspace(5) [[STRUCTONEMEM_I]], align 8 -// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_KernelOneMember(<2 x i32> [[TMP13]]) #[[ATTR6]] -// AMDGCN-NEXT: [[TMP14:%.*]] = load <2 x i32>, ptr addrspace(5) [[STRUCTTWOMEM_I]], align 8 -// AMDGCN-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM_I]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP16:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP15]], align 8 -// AMDGCN-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(5) [[U_I3_I]], align 8 -// AMDGCN-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_I3_I]], i32 0, i32 1 -// AMDGCN-NEXT: store <2 x i32> [[TMP16]], ptr addrspace(5) [[TMP17]], align 8 -// AMDGCN-NEXT: [[TMP18:%.*]] = load <2 x i32>, ptr addrspace(5) [[U_I3_I]], align 8 -// AMDGCN-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_I3_I]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP20:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP19]], align 8 -// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP18]], <2 x i32> [[TMP20]]) #[[ATTR5]] -// AMDGCN-NEXT: [[TMP21:%.*]] = load <2 x i32>, ptr addrspace(5) [[STRUCTTWOMEM_I]], align 8 -// AMDGCN-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM_I]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP23:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP22]], align 8 -// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_KernelTwoMember(<2 x i32> [[TMP21]], <2 x i32> [[TMP23]]) #[[ATTR6]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_caller_kern2(<2 x i32> [[TMP5]], ptr addrspace(1) noundef align 8 [[TMP4]], <2 x i32> [[TMP7]], <2 x i32> [[TMP9]]) #[[ATTR5]] // AMDGCN-NEXT: ret void // // -// AMDGCN: Function Attrs: alwaysinline convergent norecurse nounwind +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone // AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_caller_kern2( // AMDGCN-SAME: <2 x i32> [[STRUCTONEMEM_COERCE:%.*]], ptr addrspace(1) noundef align 8 [[GLOBAL_STRUCTONEMEM:%.*]], <2 x i32> [[STRUCTTWOMEM_COERCE0:%.*]], <2 x i32> [[STRUCTTWOMEM_COERCE1:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META22]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25]] { // AMDGCN-NEXT: entry: -// AMDGCN-NEXT: [[U_I3:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN-NEXT: [[U_I:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN-NEXT: [[STRUCTONEMEM:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[STRUCTTWOMEM:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[STRUCTONEMEM:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[STRUCTTWOMEM:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) // AMDGCN-NEXT: [[GLOBAL_STRUCTONEMEM_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[STRUCTONEMEM]], i32 0, i32 0 // AMDGCN-NEXT: store <2 x i32> [[STRUCTONEMEM_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 @@ -1164,28 +856,20 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // AMDGCN-NEXT: store ptr addrspace(1) [[GLOBAL_STRUCTONEMEM]], ptr addrspace(5) [[GLOBAL_STRUCTONEMEM_ADDR]], align 8 // AMDGCN-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[STRUCTONEMEM]], i32 0, i32 0 // AMDGCN-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 -// AMDGCN-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[U_I]], align 8 -// AMDGCN-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr addrspace(5) [[U_I]], align 8 -// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP3]]) #[[ATTR5]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelOneMember(<2 x i32> [[TMP2]]) #[[ATTR5]] // AMDGCN-NEXT: [[COERCE_DIVE2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[STRUCTONEMEM]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE2]], align 8 -// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_KernelOneMember(<2 x i32> [[TMP4]]) #[[ATTR6]] -// AMDGCN-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP5]], align 8 -// AMDGCN-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP7]], align 8 -// AMDGCN-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(5) [[U_I3]], align 8 -// AMDGCN-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_I3]], i32 0, i32 1 -// AMDGCN-NEXT: store <2 x i32> [[TMP8]], ptr addrspace(5) [[TMP9]], align 8 -// AMDGCN-NEXT: [[TMP10:%.*]] = load <2 x i32>, ptr addrspace(5) [[U_I3]], align 8 -// AMDGCN-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U_I3]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP12:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP11]], align 8 -// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP10]], <2 x i32> [[TMP12]]) #[[ATTR5]] -// AMDGCN-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP14:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP13]], align 8 -// AMDGCN-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP16:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP15]], align 8 -// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_KernelTwoMember(<2 x i32> [[TMP14]], <2 x i32> [[TMP16]]) #[[ATTR6]] +// AMDGCN-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE2]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_KernelOneMember(<2 x i32> [[TMP3]]) #[[ATTR5]] +// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 +// AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR5]] +// AMDGCN-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP8]], align 8 +// AMDGCN-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP11:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP10]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_KernelTwoMember(<2 x i32> [[TMP9]], <2 x i32> [[TMP11]]) #[[ATTR5]] // AMDGCN-NEXT: ret void // // @@ -1193,10 +877,6 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // AMDGCN-LABEL: define dso_local amdgpu_kernel void @caller_kern3( // AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[LARGESTRUCTONEMEM_COERCE:%.*]], [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[LARGESTRUCTTWOMEM_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META26:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META27:![0-9]+]] !kernel_arg_base_type [[META27]] !kernel_arg_type_qual [[META11]] { // AMDGCN-NEXT: entry: -// AMDGCN-NEXT: [[U_I2_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[U_I_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[LARGESTRUCTONEMEM_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[LARGESTRUCTTWOMEM_I:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[LARGESTRUCTONEMEM:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[LARGESTRUCTTWOMEM:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[LARGESTRUCTONEMEM]], i32 0, i32 0 @@ -1208,33 +888,22 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[LARGESTRUCTTWOMEM]], i32 0, i32 1 // AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[LARGESTRUCTTWOMEM_COERCE]], 1 // AMDGCN-NEXT: store [20 x <2 x i32>] [[TMP5]], ptr addrspace(5) [[TMP4]], align 8 -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[LARGESTRUCTONEMEM_I]], ptr addrspace(5) align 8 [[LARGESTRUCTONEMEM]], i64 800, i1 false) -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[LARGESTRUCTTWOMEM_I]], ptr addrspace(5) align 8 [[LARGESTRUCTTWOMEM]], i64 480, i1 false) -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U_I_I]], ptr addrspace(5) align 8 [[LARGESTRUCTONEMEM_I]], i64 800, i1 false) -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U_I_I]]) #[[ATTR5]] -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U_I2_I]], ptr addrspace(5) align 8 [[LARGESTRUCTTWOMEM_I]], i64 480, i1 false) -// AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U_I2_I]]) #[[ATTR5]] -// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[LARGESTRUCTONEMEM_I]]) #[[ATTR6]] -// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[LARGESTRUCTTWOMEM_I]]) #[[ATTR6]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_caller_kern3(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[LARGESTRUCTONEMEM]], ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[LARGESTRUCTTWOMEM]]) #[[ATTR5]] // AMDGCN-NEXT: ret void // // -// AMDGCN: Function Attrs: alwaysinline convergent norecurse nounwind +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone // AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_caller_kern3( // AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]], ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP1:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META26]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META27]] !kernel_arg_base_type [[META27]] !kernel_arg_type_qual [[META11]] { // AMDGCN-NEXT: entry: -// AMDGCN-NEXT: [[U_I2:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[U_I:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[LARGESTRUCTONEMEM:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[LARGESTRUCTTWOMEM:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[LARGESTRUCTONEMEM]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) // AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[LARGESTRUCTTWOMEM]], ptr addrspace(5) align 8 [[TMP1]], i64 480, i1 false) -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U_I]], ptr addrspace(5) align 8 [[LARGESTRUCTONEMEM]], i64 800, i1 false) -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U_I]]) #[[ATTR5]] -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U_I2]], ptr addrspace(5) align 8 [[LARGESTRUCTTWOMEM]], i64 480, i1 false) -// AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U_I2]]) #[[ATTR5]] -// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[LARGESTRUCTONEMEM]]) #[[ATTR6]] -// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[LARGESTRUCTTWOMEM]]) #[[ATTR6]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[LARGESTRUCTONEMEM]]) #[[ATTR5]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[LARGESTRUCTTWOMEM]]) #[[ATTR5]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[LARGESTRUCTONEMEM]]) #[[ATTR5]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[LARGESTRUCTTWOMEM]]) #[[ATTR5]] // AMDGCN-NEXT: ret void // //. diff --git a/clang/test/CodeGenOpenCL/reflect.cl b/clang/test/CodeGenOpenCL/reflect.cl index fe43187d6a3f1..4abb40aa3ed50 100644 --- a/clang/test/CodeGenOpenCL/reflect.cl +++ b/clang/test/CodeGenOpenCL/reflect.cl @@ -1,6 +1,13 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 4 // RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -emit-llvm -O0 -o - | FileCheck %s +bool device_function() { + return __nvvm_reflect("__CUDA_ARCH") >= 700; +} + +__kernel void kernel_function(__global int *i) { + *i = device_function(); +} // CHECK-LABEL: define dso_local zeroext i1 @device_function( // CHECK-SAME: ) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: entry: @@ -8,27 +15,28 @@ // CHECK-NEXT: [[CMP:%.*]] = icmp uge i32 [[TMP0]], 700 // CHECK-NEXT: ret i1 [[CMP]] // -bool device_function() { - return __nvvm_reflect("__CUDA_ARCH") >= 700; -} - +// // CHECK-LABEL: define dso_local ptx_kernel void @kernel_function( // CHECK-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6:![0-9]+]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[I_ADDR_I:%.*]] = alloca ptr addrspace(1), align 4 // CHECK-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // CHECK-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[I_ADDR]], align 4 -// CHECK-NEXT: store ptr addrspace(1) [[TMP0]], ptr [[I_ADDR_I]], align 4 -// CHECK-NEXT: [[CALL_I:%.*]] = call zeroext i1 @device_function() #[[ATTR4:[0-9]+]] -// CHECK-NEXT: [[CONV_I:%.*]] = zext i1 [[CALL_I]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[I_ADDR_I]], align 4 -// CHECK-NEXT: store i32 [[CONV_I]], ptr addrspace(1) [[TMP1]], align 4 +// CHECK-NEXT: call void @__clang_ocl_kern_imp_kernel_function(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR3:[0-9]+]] +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define dso_local ptx_kernel void @__clang_ocl_kern_imp_kernel_function( +// CHECK-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META5]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// CHECK-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR]], align 4 +// CHECK-NEXT: [[CALL:%.*]] = call zeroext i1 @device_function() #[[ATTR3]] +// CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CALL]] to i32 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[I_ADDR]], align 4 +// CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[TMP0]], align 4 // CHECK-NEXT: ret void // -__kernel void kernel_function(__global int *i) { - *i = device_function(); -} //. // CHECK: [[META3]] = !{i32 1} // CHECK: [[META4]] = !{!"none"} diff --git a/clang/test/CodeGenOpenCL/sampler.cl b/clang/test/CodeGenOpenCL/sampler.cl index 3649afba50fe2..227242be73c5e 100644 --- a/clang/test/CodeGenOpenCL/sampler.cl +++ b/clang/test/CodeGenOpenCL/sampler.cl @@ -36,8 +36,12 @@ void fnc4smp(sampler_t s) {} kernel void foo(sampler_t smp_par) { // CHECK-SPIR-LABEL: define{{.*}} spir_kernel void @foo(target("spirv.Sampler") %smp_par) + // CHECK-SPIR: call spir_func void @__clang_ocl_kern_imp_foo(target("spirv.Sampler") %0) + // CHECK-SPIR-LABEL: define{{.*}} spir_func void @__clang_ocl_kern_imp_foo(target("spirv.Sampler") %smp_par) // CHECK-SPIR: [[smp_par_ptr:%[A-Za-z0-9_\.]+]] = alloca target("spirv.Sampler") // CHECK-X86-LABEL: define{{.*}} spir_kernel void @foo(ptr %smp_par) + // CHECK-X86: call void @__clang_ocl_kern_imp_foo(ptr %0) + // CHECK-X86-LABEL: define{{.*}} void @__clang_ocl_kern_imp_foo(ptr %smp_par) // CHECK-X86: [[smp_par_ptr:%[A-Za-z0-9_\.]+]] = alloca ptr // Case 2b diff --git a/clang/test/CodeGenOpenCL/spir-calling-conv.cl b/clang/test/CodeGenOpenCL/spir-calling-conv.cl index b2015ea0f729d..56e1b662a7b4e 100644 --- a/clang/test/CodeGenOpenCL/spir-calling-conv.cl +++ b/clang/test/CodeGenOpenCL/spir-calling-conv.cl @@ -4,8 +4,11 @@ int get_dummy_id(int D); kernel void bar(global int *A); +//CHECK: define{{.*}} spir_kernel void @foo(ptr addrspace(1) noundef align 4 %A) +//CHECK: tail call spir_func void @__clang_ocl_kern_imp_bar(ptr addrspace(1) noundef align 4 %A) + kernel void foo(global int *A) -// CHECK: define{{.*}} spir_kernel void @foo(ptr addrspace(1) noundef align 4 %A) +// CHECK: define{{.*}} spir_func void @__clang_ocl_kern_imp_foo(ptr addrspace(1) noundef align 4 %A) { int id = get_dummy_id(0); // CHECK: %{{[a-z0-9_]+}} = tail call spir_func i32 @get_dummy_id(i32 noundef 0) diff --git a/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl b/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl index f15130d5f8b61..9331afc305c25 100644 --- a/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl +++ b/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl @@ -1,5 +1,5 @@ // REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx908 -Rpass-analysis=kernel-resource-usage -S -O0 -verify %s -o /dev/null +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx908 -Rpass-analysis=kernel-resource-usage -S -O1 -verify %s -o /dev/null // expected-remark@+10 {{Function Name: foo}} // expected-remark@+9 {{ TotalSGPRs: 13}} From 1b8c7cf2c0b7c7b15296c34ec753bfb017deae81 Mon Sep 17 00:00:00 2001 From: anikelal Date: Sun, 6 Apr 2025 20:52:43 +0530 Subject: [PATCH 8/8] Minor fixes --- clang/lib/CodeGen/CGCall.cpp | 7 +++---- clang/lib/CodeGen/CodeGenFunction.cpp | 2 +- clang/lib/CodeGen/CodeGenModule.cpp | 2 -- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index ab200a63194f5..b25cdf9523ae1 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -2505,10 +2505,9 @@ void CodeGenModule::ConstructAttributeList(StringRef Name, if (TargetDecl->hasAttr() && CallingConv != CallingConv::CC_C && - CallingConv != - CallingConv::CC_SpirFunction) { // Check CallingConv to avoid adding - // uniform-work-group-size attribute - // to OpenCL Kernel Stub + CallingConv != CallingConv::CC_SpirFunction) { + // Check CallingConv to avoid adding uniform-work-group-size attribute to + // OpenCL Kernel Stub if (getLangOpts().OpenCLVersion <= 120) { // OpenCL v1.2 Work groups are always uniform FuncAttrs.addAttribute("uniform-work-group-size", "true"); diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 650922dcfac3f..232d48141a0c0 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -1614,7 +1614,7 @@ void CodeGenFunction::GenerateCode(GlobalDecl GD, llvm::Function *Fn, CGM.getRawFunctionPointer(GDStub, FTy); CGCallee GDStubCallee = CGCallee::forDirect(GDStubFunctionPointer, GDStub); EmitCall(FnInfo, GDStubCallee, ReturnValueSlot(), CallArgs, nullptr, false, - Loc); // set IsMustTail=true? + Loc); } else if (Body) { EmitFunctionBody(Body); } else diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 3a3e9ac77670c..0154799498f5e 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -3893,7 +3893,6 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) { // Ignore declarations, they will be emitted on their first use. if (const auto *FD = dyn_cast(Global)) { - if (FD->hasAttr() && FD->doesThisDeclarationHaveABody()) addDeferredDeclToEmit(GlobalDecl(FD, KernelReferenceKind::Stub)); @@ -6148,7 +6147,6 @@ void CodeGenModule::EmitGlobalFunctionDefinition(GlobalDecl GD, CodeGenFunction(*this).GenerateCode(GD, Fn, FI); setNonAliasAttributes(GD, Fn); - SetLLVMFunctionAttributesForDefinition(D, Fn); if (const ConstructorAttr *CA = D->getAttr())