diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index f6ab81d3ca0bb..ca915cd3f3732 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -70,6 +70,10 @@ struct GenericToNVVMPass : PassInfoMixin<GenericToNVVMPass> { PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); }; +struct NVPTXCopyByValArgsPass : PassInfoMixin<NVPTXCopyByValArgsPass> { + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + namespace NVPTX { enum DrvInterface { NVCL, diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp index bb76cfd6fdb7b..17d605d95eff3 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -543,6 +543,33 @@ struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> { PI.setAborted(&II); } }; // struct ArgUseChecker + +void copyByValParam(Function &F, Argument &Arg) { + LLVM_DEBUG(dbgs() << "Creating a local copy of " << Arg << "\n"); + // Create a temporary local copy of the byval argument. + BasicBlock::iterator FirstInst = F.getEntryBlock().begin(); + Type *StructType = Arg.getParamByValType(); + const DataLayout &DL = F.getDataLayout(); + AllocaInst *AllocA = new AllocaInst(StructType, DL.getAllocaAddrSpace(), + Arg.getName(), FirstInst); + // Set the alignment to alignment of the byval parameter. This is because, + // later load/stores assume that alignment, and we are going to replace + // the use of the byval parameter with this alloca instruction. + AllocA->setAlignment(F.getParamAlign(Arg.getArgNo()) + .value_or(DL.getPrefTypeAlign(StructType))); + Arg.replaceAllUsesWith(AllocA); + + Value *ArgInParam = new AddrSpaceCastInst( + &Arg, PointerType::get(Arg.getContext(), ADDRESS_SPACE_PARAM), + Arg.getName(), FirstInst); + // Be sure to propagate alignment to this memcpy; LLVM doesn't know that + // NVPTX addrspacecast preserves alignment. Since params are constant, this + // copy is definitely not volatile. 
+ const auto ArgSize = *AllocA->getAllocationSize(DL); + IRBuilder<> IRB(&*FirstInst); + IRB.CreateMemCpy(AllocA, AllocA->getAlign(), ArgInParam, AllocA->getAlign(), + ArgSize); +} } // namespace void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM, @@ -558,7 +585,7 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM, ArgUseChecker AUC(DL, IsGridConstant); ArgUseChecker::PtrInfo PI = AUC.visitArgPtr(*Arg); - bool ArgUseIsReadOnly = !(PI.isEscaped() || PI.isAborted()); + bool ArgUseIsReadOnly = !(PI.isEscaped() || PI.isAborted()); // Easy case, accessing parameter directly is fine. if (ArgUseIsReadOnly && AUC.Conditionals.empty()) { // Convert all loads and intermediate operations to use parameter AS and @@ -587,7 +614,6 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM, // However, we're still not allowed to write to it. If the user specified // `__grid_constant__` for the argument, we'll consider escaped pointer as // read-only. - unsigned AS = DL.getAllocaAddrSpace(); if (HasCvtaParam && (ArgUseIsReadOnly || IsGridConstant)) { LLVM_DEBUG(dbgs() << "Using non-copy pointer to " << *Arg << "\n"); // Replace all argument pointer uses (which might include a device function @@ -612,29 +638,8 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM, // Do not replace Arg in the cast to param space CastToParam->setOperand(0, Arg); - } else { - LLVM_DEBUG(dbgs() << "Creating a local copy of " << *Arg << "\n"); - // Otherwise we have to create a temporary copy. - AllocaInst *AllocA = - new AllocaInst(StructType, AS, Arg->getName(), FirstInst); - // Set the alignment to alignment of the byval parameter. This is because, - // later load/stores assume that alignment, and we are going to replace - // the use of the byval parameter with this alloca instruction. 
- AllocA->setAlignment(Func->getParamAlign(Arg->getArgNo()) - .value_or(DL.getPrefTypeAlign(StructType))); - Arg->replaceAllUsesWith(AllocA); - - Value *ArgInParam = new AddrSpaceCastInst( - Arg, PointerType::get(Arg->getContext(), ADDRESS_SPACE_PARAM), - Arg->getName(), FirstInst); - // Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX - // addrspacecast preserves alignment. Since params are constant, this load - // is definitely not volatile. - const auto ArgSize = *AllocA->getAllocationSize(DL); - IRBuilder<> IRB(&*FirstInst); - IRB.CreateMemCpy(AllocA, AllocA->getAlign(), ArgInParam, AllocA->getAlign(), - ArgSize); - } + } else + copyByValParam(*Func, *Arg); } void NVPTXLowerArgs::markPointerAsGlobal(Value *Ptr) { @@ -734,3 +739,22 @@ bool NVPTXLowerArgs::runOnFunction(Function &F) { } FunctionPass *llvm::createNVPTXLowerArgsPass() { return new NVPTXLowerArgs(); } + +static bool copyFunctionByValArgs(Function &F) { + LLVM_DEBUG(dbgs() << "Creating a copy of byval args of " << F.getName() + << "\n"); + bool Changed = false; + for (Argument &Arg : F.args()) + if (Arg.getType()->isPointerTy() && Arg.hasByValAttr() && + !(isParamGridConstant(Arg) && isKernelFunction(F))) { + copyByValParam(F, Arg); + Changed = true; + } + return Changed; +} + +PreservedAnalyses NVPTXCopyByValArgsPass::run(Function &F, + FunctionAnalysisManager &AM) { + return copyFunctionByValArgs(F) ? 
PreservedAnalyses::none() + : PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def index 6ff15ab6f13c4..28ea9dd9c0227 100644 --- a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def +++ b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def @@ -37,4 +37,5 @@ FUNCTION_ALIAS_ANALYSIS("nvptx-aa", NVPTXAA()) #endif FUNCTION_PASS("nvvm-intr-range", NVVMIntrRangePass()) FUNCTION_PASS("nvvm-reflect", NVVMReflectPass()) +FUNCTION_PASS("nvptx-copy-byval-args", NVPTXCopyByValArgsPass()) #undef FUNCTION_PASS diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 7d04cf3dc51e6..17f4c6ec18852 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -64,6 +64,29 @@ static cl::opt<bool> UseShortPointersOpt( "Use 32-bit pointers for accessing const/local/shared address spaces."), cl::init(false), cl::Hidden); +// byval arguments in NVPTX are special. We're only allowed to read from them +// using a special instruction, and if we ever need to write to them or take an +// address, we must make a local copy and use it, instead. +// +// The problem is that local copies are very expensive, and we create them very +// late in the compilation pipeline, so LLVM does not have much of a chance to +// eliminate them, if they turn out to be unnecessary. +// +// One way around that is to create such copies early on, and let them percolate +// through the optimizations. The copying itself will never trigger creation of +// another copy later on, as the reads are allowed. If LLVM can eliminate it, +// it's a win. If the full optimization pipeline can't remove the copy, that's +// as good as it gets in terms of the effort we could've done, and it's +// certainly a much better effort than what we do now. 
+// +// This early injection of the copies has potential to create undesirable +// side-effects, so it's disabled by default, for now, until it sees more +// testing. +static cl::opt<bool> EarlyByValArgsCopy( + "nvptx-early-byval-copy", + cl::desc("Create a copy of byval function arguments early."), + cl::init(false), cl::Hidden); + namespace llvm { void initializeGenericToNVVMLegacyPassPass(PassRegistry &); @@ -236,6 +259,8 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { // Note: NVVMIntrRangePass was causing numerical discrepancies at one // point, if issues crop up, consider disabling. FPM.addPass(NVVMIntrRangePass()); + if (EarlyByValArgsCopy) + FPM.addPass(NVPTXCopyByValArgsPass()); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); }); } diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll index 013694277039f..5c52626a711fe 100644 --- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --scrub-attributes --version 5 ; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_60 -mattr=ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,SM_60 ; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,SM_70 +; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -passes=nvptx-copy-byval-args -S | FileCheck %s --check-prefixes=COMMON,COPY source_filename = "" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" @@ -24,15 +25,35 @@ declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local void @read_only(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 
4 %s) local_unnamed_addr #0 { -; COMMON-LABEL: define dso_local void @read_only( -; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; COMMON-NEXT: [[ENTRY:.*:]] -; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; COMMON-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4 -; COMMON-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 -; COMMON-NEXT: ret void +; SM_60-LABEL: define dso_local void @read_only( +; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; SM_60-NEXT: [[ENTRY:.*:]] +; SM_60-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; SM_60-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4 +; SM_60-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 +; SM_60-NEXT: ret void +; +; SM_70-LABEL: define dso_local void @read_only( +; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; SM_70-NEXT: [[ENTRY:.*:]] +; SM_70-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; SM_70-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4 +; SM_70-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 +; SM_70-NEXT: ret void +; +; COPY-LABEL: define dso_local void @read_only( +; COPY-SAME: ptr 
nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; COPY-NEXT: [[ENTRY:.*:]] +; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; COPY-NEXT: [[I:%.*]] = load i32, ptr [[S1]], align 4 +; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4 +; COPY-NEXT: ret void ; entry: %i = load i32, ptr %s, align 4 @@ -42,16 +63,38 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local void @read_only_gep(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; COMMON-LABEL: define dso_local void @read_only_gep( -; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COMMON-NEXT: [[ENTRY:.*:]] -; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; COMMON-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 -; COMMON-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 -; COMMON-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 -; COMMON-NEXT: ret void +; SM_60-LABEL: define dso_local void @read_only_gep( +; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SM_60-NEXT: [[ENTRY:.*:]] +; SM_60-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr 
addrspace(1) +; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; SM_60-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 +; SM_60-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 +; SM_60-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 +; SM_60-NEXT: ret void +; +; SM_70-LABEL: define dso_local void @read_only_gep( +; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SM_70-NEXT: [[ENTRY:.*:]] +; SM_70-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; SM_70-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 +; SM_70-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 +; SM_70-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 +; SM_70-NEXT: ret void +; +; COPY-LABEL: define dso_local void @read_only_gep( +; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COPY-NEXT: [[ENTRY:.*:]] +; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 +; COPY-NEXT: [[I:%.*]] = load i32, ptr [[B]], align 4 +; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4 +; COPY-NEXT: ret void ; entry: %b = getelementptr inbounds nuw i8, ptr %s, i64 4 @@ -62,16 +105,39 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local void @read_only_gep_asc(ptr nocapture noundef 
writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; COMMON-LABEL: define dso_local void @read_only_gep_asc( -; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COMMON-NEXT: [[ENTRY:.*:]] -; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; COMMON-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 -; COMMON-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 -; COMMON-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 -; COMMON-NEXT: ret void +; SM_60-LABEL: define dso_local void @read_only_gep_asc( +; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SM_60-NEXT: [[ENTRY:.*:]] +; SM_60-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; SM_60-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 +; SM_60-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 +; SM_60-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 +; SM_60-NEXT: ret void +; +; SM_70-LABEL: define dso_local void @read_only_gep_asc( +; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SM_70-NEXT: [[ENTRY:.*:]] +; SM_70-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr 
addrspace(1) [[OUT1]] to ptr +; SM_70-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 +; SM_70-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 +; SM_70-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 +; SM_70-NEXT: ret void +; +; COPY-LABEL: define dso_local void @read_only_gep_asc( +; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COPY-NEXT: [[ENTRY:.*:]] +; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 +; COPY-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) +; COPY-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[ASC]], align 4 +; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4 +; COPY-NEXT: ret void ; entry: %b = getelementptr inbounds nuw i8, ptr %s, i64 4 @@ -83,20 +149,48 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; COMMON-LABEL: define dso_local void @read_only_gep_asc0( -; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COMMON-NEXT: [[ENTRY:.*:]] -; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr 
addrspace(1) -; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 -; COMMON-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) -; COMMON-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr -; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4 -; COMMON-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 -; COMMON-NEXT: ret void +; SM_60-LABEL: define dso_local void @read_only_gep_asc0( +; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SM_60-NEXT: [[ENTRY:.*:]] +; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) +; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; SM_60-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 +; SM_60-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) +; SM_60-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr +; SM_60-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4 +; SM_60-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 +; SM_60-NEXT: ret void +; +; SM_70-LABEL: define dso_local void @read_only_gep_asc0( +; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SM_70-NEXT: [[ENTRY:.*:]] +; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) 
+; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; SM_70-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 +; SM_70-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) +; SM_70-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr +; SM_70-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4 +; SM_70-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 +; SM_70-NEXT: ret void +; +; COPY-LABEL: define dso_local void @read_only_gep_asc0( +; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COPY-NEXT: [[ENTRY:.*:]] +; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 +; COPY-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) +; COPY-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr +; COPY-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4 +; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4 +; COPY-NEXT: ret void ; entry: %b = getelementptr inbounds nuw i8, ptr %s, i64 4 @@ -109,16 +203,36 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local void @escape_ptr(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; COMMON-LABEL: define dso_local void @escape_ptr( -; COMMON-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COMMON-NEXT: [[ENTRY:.*:]] -; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; 
COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S3]]) -; COMMON-NEXT: ret void +; SM_60-LABEL: define dso_local void @escape_ptr( +; SM_60-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SM_60-NEXT: [[ENTRY:.*:]] +; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) +; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; SM_60-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S3]]) +; SM_60-NEXT: ret void +; +; SM_70-LABEL: define dso_local void @escape_ptr( +; SM_70-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SM_70-NEXT: [[ENTRY:.*:]] +; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) +; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; SM_70-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S3]]) +; SM_70-NEXT: ret void +; +; COPY-LABEL: define dso_local void @escape_ptr( +; COPY-SAME: ptr nocapture noundef readnone [[OUT:%.*]], 
ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COPY-NEXT: [[ENTRY:.*:]] +; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; COPY-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) +; COPY-NEXT: ret void ; entry: call void @_Z6escapePv(ptr noundef nonnull %s) #0 @@ -127,17 +241,39 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; COMMON-LABEL: define dso_local void @escape_ptr_gep( -; COMMON-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COMMON-NEXT: [[ENTRY:.*:]] -; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 -; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) -; COMMON-NEXT: ret void +; SM_60-LABEL: define dso_local void @escape_ptr_gep( +; SM_60-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SM_60-NEXT: [[ENTRY:.*:]] +; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_60-NEXT: call void 
@llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) +; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; SM_60-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 +; SM_60-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) +; SM_60-NEXT: ret void +; +; SM_70-LABEL: define dso_local void @escape_ptr_gep( +; SM_70-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SM_70-NEXT: [[ENTRY:.*:]] +; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) +; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; SM_70-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 +; SM_70-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) +; SM_70-NEXT: ret void +; +; COPY-LABEL: define dso_local void @escape_ptr_gep( +; COPY-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COPY-NEXT: [[ENTRY:.*:]] +; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 +; COPY-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) +; COPY-NEXT: ret void ; entry: %b = getelementptr inbounds nuw i8, ptr %s, i64 4 @@ -147,16 +283,36 @@ entry: ; Function Attrs: mustprogress nofree norecurse 
nosync nounwind willreturn memory(argmem: readwrite) define dso_local void @escape_ptr_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; COMMON-LABEL: define dso_local void @escape_ptr_store( -; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COMMON-NEXT: [[ENTRY:.*:]] -; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; COMMON-NEXT: store ptr [[S3]], ptr [[OUT2]], align 8 -; COMMON-NEXT: ret void +; SM_60-LABEL: define dso_local void @escape_ptr_store( +; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SM_60-NEXT: [[ENTRY:.*:]] +; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) +; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; SM_60-NEXT: store ptr [[S3]], ptr [[OUT2]], align 8 +; SM_60-NEXT: ret void +; +; SM_70-LABEL: define dso_local void @escape_ptr_store( +; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SM_70-NEXT: [[ENTRY:.*:]] +; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr 
addrspace(101) +; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) +; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; SM_70-NEXT: store ptr [[S3]], ptr [[OUT2]], align 8 +; SM_70-NEXT: ret void +; +; COPY-LABEL: define dso_local void @escape_ptr_store( +; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COPY-NEXT: [[ENTRY:.*:]] +; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; COPY-NEXT: store ptr [[S1]], ptr [[OUT]], align 8 +; COPY-NEXT: ret void ; entry: store ptr %s, ptr %out, align 8 @@ -165,17 +321,39 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; COMMON-LABEL: define dso_local void @escape_ptr_gep_store( -; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COMMON-NEXT: [[ENTRY:.*:]] -; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 -; COMMON-NEXT: store ptr 
[[B]], ptr [[OUT2]], align 8 -; COMMON-NEXT: ret void +; SM_60-LABEL: define dso_local void @escape_ptr_gep_store( +; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SM_60-NEXT: [[ENTRY:.*:]] +; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) +; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; SM_60-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 +; SM_60-NEXT: store ptr [[B]], ptr [[OUT2]], align 8 +; SM_60-NEXT: ret void +; +; SM_70-LABEL: define dso_local void @escape_ptr_gep_store( +; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SM_70-NEXT: [[ENTRY:.*:]] +; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) +; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; SM_70-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 +; SM_70-NEXT: store ptr [[B]], ptr [[OUT2]], align 8 +; SM_70-NEXT: ret void +; +; COPY-LABEL: define dso_local void @escape_ptr_gep_store( +; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COPY-NEXT: [[ENTRY:.*:]] +; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to 
ptr addrspace(101) +; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 +; COPY-NEXT: store ptr [[B]], ptr [[OUT]], align 8 +; COPY-NEXT: ret void ; entry: %b = getelementptr inbounds nuw i8, ptr %s, i64 4 @@ -185,17 +363,39 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; COMMON-LABEL: define dso_local void @escape_ptrtoint( -; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COMMON-NEXT: [[ENTRY:.*:]] -; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; COMMON-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64 -; COMMON-NEXT: store i64 [[I]], ptr [[OUT2]], align 8 -; COMMON-NEXT: ret void +; SM_60-LABEL: define dso_local void @escape_ptrtoint( +; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SM_60-NEXT: [[ENTRY:.*:]] +; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) +; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_60-NEXT: [[OUT2:%.*]] = 
addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; SM_60-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64 +; SM_60-NEXT: store i64 [[I]], ptr [[OUT2]], align 8 +; SM_60-NEXT: ret void +; +; SM_70-LABEL: define dso_local void @escape_ptrtoint( +; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SM_70-NEXT: [[ENTRY:.*:]] +; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) +; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; SM_70-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64 +; SM_70-NEXT: store i64 [[I]], ptr [[OUT2]], align 8 +; SM_70-NEXT: ret void +; +; COPY-LABEL: define dso_local void @escape_ptrtoint( +; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COPY-NEXT: [[ENTRY:.*:]] +; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; COPY-NEXT: [[I:%.*]] = ptrtoint ptr [[S1]] to i64 +; COPY-NEXT: store i64 [[I]], ptr [[OUT]], align 8 +; COPY-NEXT: ret void ; entry: %i = ptrtoint ptr %s to i64 @@ -205,14 +405,32 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local void @memcpy_from_param(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; COMMON-LABEL: define dso_local void @memcpy_from_param( -; COMMON-SAME: ptr nocapture noundef writeonly 
[[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COMMON-NEXT: [[ENTRY:.*:]] -; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true) -; COMMON-NEXT: ret void +; SM_60-LABEL: define dso_local void @memcpy_from_param( +; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SM_60-NEXT: [[ENTRY:.*:]] +; SM_60-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true) +; SM_60-NEXT: ret void +; +; SM_70-LABEL: define dso_local void @memcpy_from_param( +; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SM_70-NEXT: [[ENTRY:.*:]] +; SM_70-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true) +; SM_70-NEXT: ret void +; +; COPY-LABEL: define dso_local void @memcpy_from_param( +; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COPY-NEXT: [[ENTRY:.*:]] +; 
COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[OUT]], ptr [[S1]], i64 16, i1 true) +; COPY-NEXT: ret void ; entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true) @@ -221,14 +439,32 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local void @memcpy_from_param_noalign (ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) %s) local_unnamed_addr #0 { -; COMMON-LABEL: define dso_local void @memcpy_from_param_noalign( -; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COMMON-NEXT: [[ENTRY:.*:]] -; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true) -; COMMON-NEXT: ret void +; SM_60-LABEL: define dso_local void @memcpy_from_param_noalign( +; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SM_60-NEXT: [[ENTRY:.*:]] +; SM_60-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true) +; SM_60-NEXT: ret void +; +; 
SM_70-LABEL: define dso_local void @memcpy_from_param_noalign( +; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SM_70-NEXT: [[ENTRY:.*:]] +; SM_70-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true) +; SM_70-NEXT: ret void +; +; COPY-LABEL: define dso_local void @memcpy_from_param_noalign( +; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COPY-NEXT: [[ENTRY:.*:]] +; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 8 +; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[S1]], ptr addrspace(101) align 8 [[S2]], i64 8, i1 false) +; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[OUT]], ptr [[S1]], i64 16, i1 true) +; COPY-NEXT: ret void ; entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true) @@ -237,16 +473,36 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; COMMON-LABEL: define dso_local void @memcpy_to_param( -; COMMON-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef readnone byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COMMON-NEXT: [[ENTRY:.*:]] -; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; 
COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; COMMON-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) -; COMMON-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr -; COMMON-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true) -; COMMON-NEXT: ret void +; SM_60-LABEL: define dso_local void @memcpy_to_param( +; SM_60-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef readnone byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SM_60-NEXT: [[ENTRY:.*:]] +; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) +; SM_60-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) +; SM_60-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr +; SM_60-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true) +; SM_60-NEXT: ret void +; +; SM_70-LABEL: define dso_local void @memcpy_to_param( +; SM_70-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef readnone byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SM_70-NEXT: [[ENTRY:.*:]] +; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) +; SM_70-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) +; SM_70-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr +; SM_70-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true) +; SM_70-NEXT: ret void +; +; COPY-LABEL: define dso_local void @memcpy_to_param( +; 
COPY-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef readnone byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COPY-NEXT: [[ENTRY:.*:]] +; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S1]], ptr [[IN]], i64 16, i1 true) +; COPY-NEXT: ret void ; entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %s, ptr %in, i64 16, i1 true) @@ -255,17 +511,39 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local void @copy_on_store(ptr nocapture noundef readonly %in, ptr nocapture noundef byval(%struct.S) align 4 %s, i1 noundef zeroext %b) local_unnamed_addr #0 { -; COMMON-LABEL: define dso_local void @copy_on_store( -; COMMON-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COMMON-NEXT: [[BB:.*:]] -; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; COMMON-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) -; COMMON-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr -; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[IN2]], align 4 -; COMMON-NEXT: store i32 [[I]], ptr [[S3]], align 4 -; COMMON-NEXT: ret void +; SM_60-LABEL: define dso_local void @copy_on_store( +; SM_60-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +; 
SM_60-NEXT: [[BB:.*:]] +; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) +; SM_60-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) +; SM_60-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr +; SM_60-NEXT: [[I:%.*]] = load i32, ptr [[IN2]], align 4 +; SM_60-NEXT: store i32 [[I]], ptr [[S3]], align 4 +; SM_60-NEXT: ret void +; +; SM_70-LABEL: define dso_local void @copy_on_store( +; SM_70-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SM_70-NEXT: [[BB:.*:]] +; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) +; SM_70-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) +; SM_70-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr +; SM_70-NEXT: [[I:%.*]] = load i32, ptr [[IN2]], align 4 +; SM_70-NEXT: store i32 [[I]], ptr [[S3]], align 4 +; SM_70-NEXT: ret void +; +; COPY-LABEL: define dso_local void @copy_on_store( +; COPY-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COPY-NEXT: [[BB:.*:]] +; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; COPY-NEXT: [[I:%.*]] = load i32, ptr [[IN]], align 4 +; COPY-NEXT: store i32 [[I]], ptr [[S1]], align 4 +; COPY-NEXT: 
ret void ; bb: %i = load i32, ptr %in, align 4 @@ -304,6 +582,20 @@ define void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[OUT2]], align 4 ; SM_70-NEXT: ret void ; +; COPY-LABEL: define void @test_select( +; COPY-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { +; COPY-NEXT: [[BB:.*:]] +; COPY-NEXT: [[INPUT23:%.*]] = alloca i32, align 4 +; COPY-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false) +; COPY-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 +; COPY-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false) +; COPY-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]] +; COPY-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 +; COPY-NEXT: store i32 [[VALLOADED]], ptr [[OUT]], align 4 +; COPY-NEXT: ret void +; bb: %ptrnew = select i1 %cond, ptr %input1, ptr %input2 %valloaded = load i32, ptr %ptrnew, align 4 @@ -312,20 +604,48 @@ bb: } define void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) { -; COMMON-LABEL: define void @test_select_write( -; COMMON-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { -; COMMON-NEXT: [[BB:.*:]] -; COMMON-NEXT: [[OUT7:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; COMMON-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr -; COMMON-NEXT: [[INPUT24:%.*]] = alloca i32, align 4 -; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) -; COMMON-NEXT: call void 
@llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT24]], ptr addrspace(101) align 4 [[INPUT25]], i64 4, i1 false) -; COMMON-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 -; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false) -; COMMON-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]] -; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4 -; COMMON-NEXT: ret void +; SM_60-LABEL: define void @test_select_write( +; SM_60-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { +; SM_60-NEXT: [[BB:.*:]] +; SM_60-NEXT: [[OUT5:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_60-NEXT: [[OUT6:%.*]] = addrspacecast ptr addrspace(1) [[OUT5]] to ptr +; SM_60-NEXT: [[INPUT23:%.*]] = alloca i32, align 4 +; SM_60-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false) +; SM_60-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 +; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false) +; SM_60-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]] +; SM_60-NEXT: store i32 1, ptr [[PTRNEW]], align 4 +; SM_60-NEXT: ret void +; +; SM_70-LABEL: define void @test_select_write( +; SM_70-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { +; SM_70-NEXT: [[BB:.*:]] +; SM_70-NEXT: [[OUT5:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_70-NEXT: [[OUT6:%.*]] = addrspacecast ptr addrspace(1) [[OUT5]] to ptr +; SM_70-NEXT: [[INPUT23:%.*]] = 
alloca i32, align 4 +; SM_70-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false) +; SM_70-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 +; SM_70-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false) +; SM_70-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]] +; SM_70-NEXT: store i32 1, ptr [[PTRNEW]], align 4 +; SM_70-NEXT: ret void +; +; COPY-LABEL: define void @test_select_write( +; COPY-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { +; COPY-NEXT: [[BB:.*:]] +; COPY-NEXT: [[INPUT23:%.*]] = alloca i32, align 4 +; COPY-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false) +; COPY-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 +; COPY-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false) +; COPY-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]] +; COPY-NEXT: store i32 1, ptr [[PTRNEW]], align 4 +; COPY-NEXT: ret void ; bb: %ptrnew = select i1 %cond, ptr %input1, ptr %input2 @@ -380,6 +700,28 @@ define void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) ; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4 ; SM_70-NEXT: ret void ; +; COPY-LABEL: define void @test_phi( +; COPY-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) 
#[[ATTR3]] { +; COPY-NEXT: [[BB:.*:]] +; COPY-NEXT: [[INPUT23:%.*]] = alloca [[STRUCT_S]], align 8 +; COPY-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT23]], ptr addrspace(101) align 8 [[INPUT24]], i64 8, i1 false) +; COPY-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4 +; COPY-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false) +; COPY-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; COPY: [[FIRST]]: +; COPY-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0 +; COPY-NEXT: br label %[[MERGE:.*]] +; COPY: [[SECOND]]: +; COPY-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT23]], i32 0, i32 1 +; COPY-NEXT: br label %[[MERGE]] +; COPY: [[MERGE]]: +; COPY-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] +; COPY-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 +; COPY-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4 +; COPY-NEXT: ret void +; bb: br i1 %cond, label %first, label %second @@ -400,7 +742,7 @@ merge: ; preds = %second, %first define void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, i1 %cond) { ; COMMON-LABEL: define void @test_phi_write( -; COMMON-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { +; COMMON-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { ; COMMON-NEXT: [[BB:.*:]] ; COMMON-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8 ; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)