diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index 86b2c4f78fc3e..608b43b59eed3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -45,16 +45,16 @@ // // This pass proceeds in three main phases: // -// ## Rewriting loads and stores of p7 +// ## Rewriting loads and stores of p7 and memcpy()-like handling // // The first phase is to rewrite away all loads and stors of `ptr addrspace(7)`, // including aggregates containing such pointers, to ones that use `i160`. This -// is handled by `StoreFatPtrsAsIntsVisitor` , which visits loads, stores, and -// allocas and, if the loaded or stored type contains `ptr addrspace(7)`, -// rewrites that type to one where the p7s are replaced by i160s, copying other -// parts of aggregates as needed. In the case of a store, each pointer is -// `ptrtoint`d to i160 before storing, and load integers are `inttoptr`d back. -// This same transformation is applied to vectors of pointers. +// is handled by `StoreFatPtrsAsIntsAndExpandMemcpyVisitor` , which visits +// loads, stores, and allocas and, if the loaded or stored type contains `ptr +// addrspace(7)`, rewrites that type to one where the p7s are replaced by i160s, +// copying other parts of aggregates as needed. In the case of a store, each +// pointer is `ptrtoint`d to i160 before storing, and load integers are +// `inttoptr`d back. This same transformation is applied to vectors of pointers. // // Such a transformation allows the later phases of the pass to not need // to handle buffer fat pointers moving to and from memory, where we load @@ -66,6 +66,10 @@ // Atomics operations on `ptr addrspace(7)` values are not suppported, as the // hardware does not include a 160-bit atomic. 
// +// In order to save on O(N) work and to ensure that the contents type +// legalizer correctly splits up wide loads, also unconditionally lower +// memcpy-like intrinsics into loops here. +// +// ## Buffer contents type legalization +// +// The underlying buffer intrinsics only support types up to 128 bits long, @@ -231,20 +235,24 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ReplaceConstant.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/AMDGPUAddrSpace.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LowerMemIntrinsics.h" #include "llvm/Transforms/Utils/ValueMapper.h" #define DEBUG_TYPE "amdgpu-lower-buffer-fat-pointers" @@ -431,14 +439,16 @@ namespace { /// marshalling costs when reading or storing these values, but since placing /// such pointers into memory is an uncommon operation at best, we feel that /// this cost is acceptable for better performance in the common case. -class StoreFatPtrsAsIntsVisitor - : public InstVisitor { +class StoreFatPtrsAsIntsAndExpandMemcpyVisitor + : public InstVisitor { BufferFatPtrToIntTypeMap *TypeMap; ValueToValueMapTy ConvertedForStore; IRBuilder<> IRB; + const TargetMachine *TM; + // Convert all the buffer fat pointers within the input value to integers // so that it can be stored in memory.
Value *fatPtrsToInts(Value *V, Type *From, Type *To, const Twine &Name); @@ -448,8 +458,10 @@ class StoreFatPtrsAsIntsVisitor Value *intsToFatPtrs(Value *V, Type *From, Type *To, const Twine &Name); public: - StoreFatPtrsAsIntsVisitor(BufferFatPtrToIntTypeMap *TypeMap, LLVMContext &Ctx) - : TypeMap(TypeMap), IRB(Ctx) {} + StoreFatPtrsAsIntsAndExpandMemcpyVisitor(BufferFatPtrToIntTypeMap *TypeMap, + LLVMContext &Ctx, + const TargetMachine *TM) + : TypeMap(TypeMap), IRB(Ctx), TM(TM) {} bool processFunction(Function &F); bool visitInstruction(Instruction &I) { return false; } @@ -457,11 +469,16 @@ class StoreFatPtrsAsIntsVisitor bool visitLoadInst(LoadInst &LI); bool visitStoreInst(StoreInst &SI); bool visitGetElementPtrInst(GetElementPtrInst &I); + + bool visitMemCpyInst(MemCpyInst &MCI); + bool visitMemMoveInst(MemMoveInst &MMI); + bool visitMemSetInst(MemSetInst &MSI); + bool visitMemSetPatternInst(MemSetPatternInst &MSPI); }; } // namespace -Value *StoreFatPtrsAsIntsVisitor::fatPtrsToInts(Value *V, Type *From, Type *To, - const Twine &Name) { +Value *StoreFatPtrsAsIntsAndExpandMemcpyVisitor::fatPtrsToInts( + Value *V, Type *From, Type *To, const Twine &Name) { if (From == To) return V; ValueToValueMapTy::iterator Find = ConvertedForStore.find(V); @@ -498,8 +515,8 @@ Value *StoreFatPtrsAsIntsVisitor::fatPtrsToInts(Value *V, Type *From, Type *To, return Ret; } -Value *StoreFatPtrsAsIntsVisitor::intsToFatPtrs(Value *V, Type *From, Type *To, - const Twine &Name) { +Value *StoreFatPtrsAsIntsAndExpandMemcpyVisitor::intsToFatPtrs( + Value *V, Type *From, Type *To, const Twine &Name) { if (From == To) return V; if (isBufferFatPtrOrVector(To)) { @@ -531,18 +548,25 @@ Value *StoreFatPtrsAsIntsVisitor::intsToFatPtrs(Value *V, Type *From, Type *To, return Ret; } -bool StoreFatPtrsAsIntsVisitor::processFunction(Function &F) { +bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::processFunction(Function &F) { bool Changed = false; - // The visitors will mutate GEPs and allocas, but 
will push loads and stores - // to the worklist to avoid invalidation. + // Process memcpy-like instructions after the main iteration because they can + // invalidate iterators. + SmallVector CanBecomeLoops; for (Instruction &I : make_early_inc_range(instructions(F))) { - Changed |= visit(I); + if (isa(I)) + CanBecomeLoops.push_back(&I); + else + Changed |= visit(I); + } + for (WeakTrackingVH VH : make_early_inc_range(CanBecomeLoops)) { + Changed |= visit(cast(VH)); } ConvertedForStore.clear(); return Changed; } -bool StoreFatPtrsAsIntsVisitor::visitAllocaInst(AllocaInst &I) { +bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitAllocaInst(AllocaInst &I) { Type *Ty = I.getAllocatedType(); Type *NewTy = TypeMap->remapType(Ty); if (Ty == NewTy) @@ -551,7 +575,8 @@ bool StoreFatPtrsAsIntsVisitor::visitAllocaInst(AllocaInst &I) { return true; } -bool StoreFatPtrsAsIntsVisitor::visitGetElementPtrInst(GetElementPtrInst &I) { +bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitGetElementPtrInst( + GetElementPtrInst &I) { Type *Ty = I.getSourceElementType(); Type *NewTy = TypeMap->remapType(Ty); if (Ty == NewTy) @@ -563,7 +588,7 @@ bool StoreFatPtrsAsIntsVisitor::visitGetElementPtrInst(GetElementPtrInst &I) { return true; } -bool StoreFatPtrsAsIntsVisitor::visitLoadInst(LoadInst &LI) { +bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitLoadInst(LoadInst &LI) { Type *Ty = LI.getType(); Type *IntTy = TypeMap->remapType(Ty); if (Ty == IntTy) @@ -581,7 +606,7 @@ bool StoreFatPtrsAsIntsVisitor::visitLoadInst(LoadInst &LI) { return true; } -bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) { +bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitStoreInst(StoreInst &SI) { Value *V = SI.getValueOperand(); Type *Ty = V->getType(); Type *IntTy = TypeMap->remapType(Ty); @@ -597,6 +622,47 @@ bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) { return true; } +bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemCpyInst( + MemCpyInst &MCI) { + // TODO: Allow 
memcpy.p7.p3 as a synonym for the direct-to-LDS copy, which'll + // need loop expansion here. + if (MCI.getSourceAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER && + MCI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER) + return false; + llvm::expandMemCpyAsLoop(&MCI, + TM->getTargetTransformInfo(*MCI.getFunction())); + MCI.eraseFromParent(); + return true; +} + +bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemMoveInst( + MemMoveInst &MMI) { + if (MMI.getSourceAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER && + MMI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER) + return false; + report_fatal_error( + "memmove() on buffer descriptors is not implemented because pointer " + "comparison on buffer descriptors isn't implemented\n"); +} + +bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetInst( + MemSetInst &MSI) { + if (MSI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER) + return false; + llvm::expandMemSetAsLoop(&MSI); + MSI.eraseFromParent(); + return true; +} + +bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetPatternInst( + MemSetPatternInst &MSPI) { + if (MSPI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER) + return false; + llvm::expandMemSetPatternAsLoop(&MSPI); + MSPI.eraseFromParent(); + return true; +} + namespace { /// Convert loads/stores of types that the buffer intrinsics can't handle into /// one ore more such loads/stores that consist of legal types. 
@@ -1127,6 +1193,7 @@ bool LegalizeBufferContentTypesVisitor::visitStoreInst(StoreInst &SI) { bool LegalizeBufferContentTypesVisitor::processFunction(Function &F) { bool Changed = false; + // Note, memory transfer intrinsics won't be seen here: they were already expanded into load/store loops by the earlier phase. for (Instruction &I : make_early_inc_range(instructions(F))) { Changed |= visit(I); } @@ -2084,6 +2151,12 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) { case Intrinsic::invariant_end: case Intrinsic::launder_invariant_group: case Intrinsic::strip_invariant_group: + case Intrinsic::memcpy: + case Intrinsic::memcpy_inline: + case Intrinsic::memmove: + case Intrinsic::memset: + case Intrinsic::memset_inline: + case Intrinsic::experimental_memset_pattern: return true; } } @@ -2353,7 +2426,8 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) { /*RemoveDeadConstants=*/false, /*IncludeSelf=*/true); } - StoreFatPtrsAsIntsVisitor MemOpsRewrite(&IntTM, M.getContext()); + StoreFatPtrsAsIntsAndExpandMemcpyVisitor MemOpsRewrite(&IntTM, M.getContext(), + &TM); LegalizeBufferContentTypesVisitor BufferContentsTypeRewrite(DL, M.getContext()); for (Function &F : M.functions()) { diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll new file mode 100644 index 0000000000000..8e023723ec25c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll @@ -0,0 +1,1345 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefix=SDAG-GFX942 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=SDAG-GFX1100 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -check-prefix=GISEL-GFX942 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -check-prefix=GISEL-GFX1100 %s + +; Note: if you're adding tests here, also add
them to +; lower-buffer-fat-pointers-mem-transfer.ll to verify the IR produced by +; the lowering. +; +; This file is a sanity check to make sure that the code generated +; for buffer-related memcpy() calls turns into something reasonable in +; the backend, despite the wide intermediate vectors + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8" +target triple = "amdgcn--" + +;; memcpy + +declare void @llvm.memcpy.p7.p7.i32(ptr addrspace(7), ptr addrspace(7), i32, i1) + +define amdgpu_kernel void @memcpy_known(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) { +; SDAG-LABEL: memcpy_known: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s7, s24 +; SDAG-NEXT: s_mov_b32 s6, s23 +; SDAG-NEXT: s_mov_b32 s5, s22 +; SDAG-NEXT: s_mov_b32 s4, s21 +; SDAG-NEXT: s_mov_b32 s8, 0 +; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SDAG-NEXT: 
buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SDAG-NEXT: .LBB0_1: ; %load-store-loop +; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-NEXT: s_add_i32 s9, s20, s8 +; SDAG-NEXT: v_mov_b32_e32 v60, s9 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], v60, s[16:19], 0 offen +; SDAG-NEXT: s_add_i32 s9, s25, s8 +; SDAG-NEXT: s_addk_i32 s8, 0x100 +; SDAG-NEXT: s_cmpk_lt_u32 s8, 0x2000 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], v60, s[16:19], 0 offen offset:16 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_load_dwordx4 v[8:11], v60, s[16:19], 0 offen offset:32 +; SDAG-NEXT: buffer_load_dwordx4 v[12:15], v60, s[16:19], 0 offen offset:48 +; SDAG-NEXT: buffer_load_dwordx4 v[16:19], v60, s[16:19], 0 offen offset:64 +; SDAG-NEXT: buffer_load_dwordx4 v[20:23], v60, s[16:19], 0 offen offset:80 +; SDAG-NEXT: buffer_load_dwordx4 v[24:27], v60, s[16:19], 0 offen 
offset:96 +; SDAG-NEXT: buffer_load_dwordx4 v[28:31], v60, s[16:19], 0 offen offset:112 +; SDAG-NEXT: buffer_load_dwordx4 v[32:35], v60, s[16:19], 0 offen offset:128 +; SDAG-NEXT: buffer_load_dwordx4 v[36:39], v60, s[16:19], 0 offen offset:144 +; SDAG-NEXT: buffer_load_dwordx4 v[48:51], v60, s[16:19], 0 offen offset:160 +; SDAG-NEXT: buffer_load_dwordx4 v[52:55], v60, s[16:19], 0 offen offset:176 +; SDAG-NEXT: buffer_load_dwordx4 v[40:43], v60, s[16:19], 0 offen offset:192 +; SDAG-NEXT: buffer_load_dwordx4 v[44:47], v60, s[16:19], 0 offen offset:208 +; SDAG-NEXT: buffer_load_dwordx4 v[56:59], v60, s[16:19], 0 offen offset:224 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: buffer_load_dwordx4 v[60:63], v60, s[16:19], 0 offen offset:240 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SDAG-NEXT: v_mov_b32_e32 v0, s9 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: buffer_store_dwordx4 v[1:4], v0, s[4:7], 0 offen +; SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: buffer_store_dwordx4 v[1:4], v0, s[4:7], 0 offen offset:16 +; SDAG-NEXT: buffer_store_dwordx4 v[8:11], v0, s[4:7], 0 offen offset:32 +; SDAG-NEXT: buffer_store_dwordx4 v[12:15], v0, s[4:7], 0 offen offset:48 +; SDAG-NEXT: buffer_store_dwordx4 v[16:19], v0, s[4:7], 0 offen offset:64 +; SDAG-NEXT: buffer_store_dwordx4 v[20:23], v0, s[4:7], 0 offen 
offset:80 +; SDAG-NEXT: buffer_store_dwordx4 v[24:27], v0, s[4:7], 0 offen offset:96 +; SDAG-NEXT: buffer_store_dwordx4 v[28:31], v0, s[4:7], 0 offen offset:112 +; SDAG-NEXT: buffer_store_dwordx4 v[32:35], v0, s[4:7], 0 offen offset:128 +; SDAG-NEXT: buffer_store_dwordx4 v[36:39], v0, s[4:7], 0 offen offset:144 +; SDAG-NEXT: buffer_store_dwordx4 v[48:51], v0, s[4:7], 0 offen offset:160 +; SDAG-NEXT: buffer_store_dwordx4 v[52:55], v0, s[4:7], 0 offen offset:176 +; SDAG-NEXT: buffer_store_dwordx4 v[40:43], v0, s[4:7], 0 offen offset:192 +; SDAG-NEXT: buffer_store_dwordx4 v[44:47], v0, s[4:7], 0 offen offset:208 +; SDAG-NEXT: buffer_store_dwordx4 v[56:59], v0, s[4:7], 0 offen offset:224 +; SDAG-NEXT: buffer_store_dwordx4 v[60:63], v0, s[4:7], 0 offen offset:240 +; SDAG-NEXT: s_cbranch_scc1 .LBB0_1 +; SDAG-NEXT: ; %bb.2: ; %memcpy-split +; SDAG-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword 
v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: memcpy_known: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s8, 0 +; GISEL-NEXT: s_mov_b32 s4, s21 +; GISEL-NEXT: s_mov_b32 s5, s22 +; GISEL-NEXT: s_mov_b32 s6, s23 +; GISEL-NEXT: s_mov_b32 s7, s24 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x2000 +; GISEL-NEXT: v_mov_b32_e32 v1, s8 +; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GISEL-NEXT: .LBB0_1: ; %load-store-loop +; 
GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-NEXT: v_add_u32_e32 v46, s20, v1 +; GISEL-NEXT: buffer_load_dwordx4 v[2:5], v46, s[16:19], 0 offen +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_load_dwordx4 v[6:9], v46, s[16:19], 0 offen offset:16 +; GISEL-NEXT: buffer_load_dwordx4 v[10:13], v46, s[16:19], 0 offen offset:32 +; GISEL-NEXT: buffer_load_dwordx4 v[14:17], v46, s[16:19], 0 offen offset:48 +; GISEL-NEXT: buffer_load_dwordx4 v[18:21], v46, s[16:19], 0 offen offset:64 +; GISEL-NEXT: buffer_load_dwordx4 v[22:25], v46, s[16:19], 0 offen offset:80 +; GISEL-NEXT: buffer_load_dwordx4 v[26:29], v46, s[16:19], 0 offen offset:96 +; GISEL-NEXT: buffer_load_dwordx4 v[30:33], v46, s[16:19], 0 offen offset:112 +; GISEL-NEXT: buffer_load_dwordx4 v[34:37], v46, s[16:19], 0 offen offset:128 +; GISEL-NEXT: buffer_load_dwordx4 v[48:51], v46, s[16:19], 0 offen offset:144 +; GISEL-NEXT: buffer_load_dwordx4 v[52:55], v46, s[16:19], 0 offen offset:160 +; GISEL-NEXT: buffer_load_dwordx4 v[38:41], v46, s[16:19], 0 offen offset:176 +; GISEL-NEXT: buffer_load_dwordx4 v[42:45], v46, s[16:19], 0 offen offset:192 +; GISEL-NEXT: buffer_load_dwordx4 v[56:59], v46, s[16:19], 0 offen offset:208 +; GISEL-NEXT: buffer_load_dwordx4 v[60:63], v46, s[16:19], 0 offen offset:224 +; GISEL-NEXT: buffer_load_dwordx4 v[2:5], v46, s[16:19], 0 offen offset:240 +; GISEL-NEXT: v_add_u32_e32 v46, s25, v1 +; GISEL-NEXT: v_add_u32_e32 v1, 0x100, v1 +; GISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; 
GISEL-NEXT: s_nop 0 +; GISEL-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: buffer_store_dwordx4 v[2:5], v46, s[4:7], 0 offen +; GISEL-NEXT: buffer_store_dwordx4 v[6:9], v46, s[4:7], 0 offen offset:16 +; GISEL-NEXT: buffer_store_dwordx4 v[10:13], v46, s[4:7], 0 offen offset:32 +; GISEL-NEXT: buffer_store_dwordx4 v[14:17], v46, s[4:7], 0 offen offset:48 +; GISEL-NEXT: buffer_store_dwordx4 v[18:21], v46, s[4:7], 0 offen offset:64 +; GISEL-NEXT: buffer_store_dwordx4 v[22:25], v46, s[4:7], 0 offen offset:80 +; GISEL-NEXT: buffer_store_dwordx4 v[26:29], v46, s[4:7], 0 offen offset:96 +; GISEL-NEXT: buffer_store_dwordx4 v[30:33], v46, s[4:7], 0 offen offset:112 +; GISEL-NEXT: buffer_store_dwordx4 v[34:37], v46, s[4:7], 0 offen offset:128 +; GISEL-NEXT: buffer_store_dwordx4 v[48:51], v46, s[4:7], 0 offen offset:144 +; GISEL-NEXT: buffer_store_dwordx4 v[52:55], v46, s[4:7], 0 offen offset:160 +; GISEL-NEXT: buffer_store_dwordx4 v[38:41], v46, s[4:7], 0 offen offset:176 +; GISEL-NEXT: buffer_store_dwordx4 v[42:45], v46, s[4:7], 0 offen offset:192 +; GISEL-NEXT: buffer_store_dwordx4 v[56:59], v46, s[4:7], 0 offen offset:208 +; GISEL-NEXT: buffer_store_dwordx4 v[60:63], v46, s[4:7], 0 offen offset:224 +; GISEL-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte 
Folded Reload +; GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: buffer_store_dwordx4 v[2:5], v46, s[4:7], 0 offen offset:240 +; GISEL-NEXT: s_cbranch_vccnz .LBB0_1 +; GISEL-NEXT: ; %bb.2: ; %memcpy-split +; GISEL-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX942-LABEL: memcpy_known: +; SDAG-GFX942: ; %bb.3: +; SDAG-GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; SDAG-GFX942-NEXT: s_load_dword s12, s[4:5], 0x10 +; SDAG-GFX942-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX942-NEXT: s_branch 
.LBB0_0 +; SDAG-GFX942-NEXT: .p2align 8 +; SDAG-GFX942-NEXT: ; %bb.4: +; SDAG-GFX942-NEXT: .LBB0_0: +; SDAG-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; SDAG-GFX942-NEXT: s_load_dword s17, s[4:5], 0x34 +; SDAG-GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x44 +; SDAG-GFX942-NEXT: s_load_dword s12, s[4:5], 0x54 +; SDAG-GFX942-NEXT: s_mov_b32 s16, 0 +; SDAG-GFX942-NEXT: s_mov_b32 s5, s16 +; SDAG-GFX942-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX942-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX942-NEXT: s_or_b64 s[6:7], s[4:5], s[16:17] +; SDAG-GFX942-NEXT: s_mov_b32 s17, s2 +; SDAG-GFX942-NEXT: s_mov_b32 s2, s1 +; SDAG-GFX942-NEXT: s_mov_b32 s3, s16 +; SDAG-GFX942-NEXT: s_or_b64 s[4:5], s[2:3], s[16:17] +; SDAG-GFX942-NEXT: s_mov_b32 s17, s12 +; SDAG-GFX942-NEXT: s_mov_b32 s2, s11 +; SDAG-GFX942-NEXT: s_or_b64 s[14:15], s[2:3], s[16:17] +; SDAG-GFX942-NEXT: s_mov_b32 s17, s10 +; SDAG-GFX942-NEXT: s_mov_b32 s2, s9 +; SDAG-GFX942-NEXT: s_or_b64 s[12:13], s[2:3], s[16:17] +; SDAG-GFX942-NEXT: .LBB0_1: ; %load-store-loop +; SDAG-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-GFX942-NEXT: s_add_i32 s1, s0, s16 +; SDAG-GFX942-NEXT: v_mov_b32_e32 v60, s1 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[8:11], v60, s[4:7], 0 offen +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[4:7], v60, s[4:7], 0 offen offset:16 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:32 +; SDAG-GFX942-NEXT: s_add_i32 s2, s8, s16 +; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s2 +; SDAG-GFX942-NEXT: s_addk_i32 s16, 0x100 +; SDAG-GFX942-NEXT: s_cmpk_lt_u32 s16, 0x2000 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a0, v15 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a1, v14 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a2, v13 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a3, v12 ; Reload Reuse +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:48 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 
v[16:19], v60, s[4:7], 0 offen offset:64 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[20:23], v60, s[4:7], 0 offen offset:80 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[24:27], v60, s[4:7], 0 offen offset:96 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[28:31], v60, s[4:7], 0 offen offset:112 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[32:35], v60, s[4:7], 0 offen offset:128 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[36:39], v60, s[4:7], 0 offen offset:144 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[40:43], v60, s[4:7], 0 offen offset:160 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[44:47], v60, s[4:7], 0 offen offset:176 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[48:51], v60, s[4:7], 0 offen offset:192 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[52:55], v60, s[4:7], 0 offen offset:208 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[56:59], v60, s[4:7], 0 offen offset:224 +; SDAG-GFX942-NEXT: s_nop 0 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[60:63], v60, s[4:7], 0 offen offset:240 +; SDAG-GFX942-NEXT: s_nop 0 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[8:11], v0, s[12:15], 0 offen +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[4:7], v0, s[12:15], 0 offen offset:16 +; SDAG-GFX942-NEXT: s_nop 1 +; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v0, s[12:15], 0 offen offset:32 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[12:15], v0, s[12:15], 0 offen offset:48 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[16:19], v0, s[12:15], 0 offen offset:64 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[20:23], v0, s[12:15], 0 offen offset:80 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[24:27], v0, 
s[12:15], 0 offen offset:96 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[28:31], v0, s[12:15], 0 offen offset:112 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[32:35], v0, s[12:15], 0 offen offset:128 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[36:39], v0, s[12:15], 0 offen offset:144 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[40:43], v0, s[12:15], 0 offen offset:160 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[44:47], v0, s[12:15], 0 offen offset:176 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[48:51], v0, s[12:15], 0 offen offset:192 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[52:55], v0, s[12:15], 0 offen offset:208 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[56:59], v0, s[12:15], 0 offen offset:224 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[60:63], v0, s[12:15], 0 offen offset:240 +; SDAG-GFX942-NEXT: s_cbranch_scc1 .LBB0_1 +; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split +; SDAG-GFX942-NEXT: s_endpgm +; +; SDAG-GFX1100-LABEL: memcpy_known: +; SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_clause 0x3 +; SDAG-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; SDAG-GFX1100-NEXT: s_load_b32 s17, s[4:5], 0x34 +; SDAG-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44 +; SDAG-GFX1100-NEXT: s_load_b32 s18, s[4:5], 0x54 +; SDAG-GFX1100-NEXT: s_mov_b32 s16, 0 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; SDAG-GFX1100-NEXT: s_mov_b32 s5, s16 +; SDAG-GFX1100-NEXT: s_mov_b32 s13, s16 +; SDAG-GFX1100-NEXT: s_mov_b32 s15, s16 +; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX1100-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX1100-NEXT: s_mov_b32 s12, s1 +; SDAG-GFX1100-NEXT: s_or_b64 s[6:7], s[4:5], s[16:17] +; 
SDAG-GFX1100-NEXT: s_mov_b32 s17, s2 +; SDAG-GFX1100-NEXT: s_mov_b32 s14, s11 +; SDAG-GFX1100-NEXT: s_or_b64 s[4:5], s[12:13], s[16:17] +; SDAG-GFX1100-NEXT: s_mov_b32 s17, s18 +; SDAG-GFX1100-NEXT: s_mov_b32 s2, s9 +; SDAG-GFX1100-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; SDAG-GFX1100-NEXT: s_mov_b32 s17, s10 +; SDAG-GFX1100-NEXT: s_mov_b32 s3, s16 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; SDAG-GFX1100-NEXT: s_or_b64 s[12:13], s[2:3], s[16:17] +; SDAG-GFX1100-NEXT: .LBB0_1: ; %load-store-loop +; SDAG-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-GFX1100-NEXT: s_add_i32 s1, s0, s16 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; SDAG-GFX1100-NEXT: v_mov_b32_e32 v60, s1 +; SDAG-GFX1100-NEXT: s_add_i32 s1, s8, s16 +; SDAG-GFX1100-NEXT: s_addk_i32 s16, 0x100 +; SDAG-GFX1100-NEXT: v_mov_b32_e32 v64, s1 +; SDAG-GFX1100-NEXT: s_cmpk_lt_u32 s16, 0x2000 +; SDAG-GFX1100-NEXT: s_clause 0xf +; SDAG-GFX1100-NEXT: buffer_load_b128 v[0:3], v60, s[4:7], 0 offen +; SDAG-GFX1100-NEXT: buffer_load_b128 v[4:7], v60, s[4:7], 0 offen offset:16 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[8:11], v60, s[4:7], 0 offen offset:32 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[12:15], v60, s[4:7], 0 offen offset:48 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[16:19], v60, s[4:7], 0 offen offset:64 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[20:23], v60, s[4:7], 0 offen offset:80 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[24:27], v60, s[4:7], 0 offen offset:96 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[28:31], v60, s[4:7], 0 offen offset:112 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[32:35], v60, s[4:7], 0 offen offset:128 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[36:39], v60, s[4:7], 0 offen offset:144 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[40:43], v60, s[4:7], 0 offen offset:160 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[44:47], v60, s[4:7], 0 offen offset:176 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[48:51], v60, s[4:7], 0 offen offset:192 +; SDAG-GFX1100-NEXT: 
buffer_load_b128 v[52:55], v60, s[4:7], 0 offen offset:208 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[56:59], v60, s[4:7], 0 offen offset:224 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[60:63], v60, s[4:7], 0 offen offset:240 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[0:3], v64, s[12:15], 0 offen +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(14) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[4:7], v64, s[12:15], 0 offen offset:16 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(13) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[8:11], v64, s[12:15], 0 offen offset:32 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(12) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[12:15], v64, s[12:15], 0 offen offset:48 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(11) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[16:19], v64, s[12:15], 0 offen offset:64 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(10) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[20:23], v64, s[12:15], 0 offen offset:80 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(9) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[24:27], v64, s[12:15], 0 offen offset:96 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(8) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[28:31], v64, s[12:15], 0 offen offset:112 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(7) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[32:35], v64, s[12:15], 0 offen offset:128 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(6) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[36:39], v64, s[12:15], 0 offen offset:144 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(5) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[40:43], v64, s[12:15], 0 offen offset:160 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(4) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[44:47], v64, s[12:15], 0 offen offset:176 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(3) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[48:51], v64, s[12:15], 0 offen offset:192 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(2) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[52:55], v64, s[12:15], 0 offen offset:208 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(1) +; 
SDAG-GFX1100-NEXT: buffer_store_b128 v[56:59], v64, s[12:15], 0 offen offset:224 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[60:63], v64, s[12:15], 0 offen offset:240 +; SDAG-GFX1100-NEXT: s_cbranch_scc1 .LBB0_1 +; SDAG-GFX1100-NEXT: ; %bb.2: ; %memcpy-split +; SDAG-GFX1100-NEXT: s_endpgm +; +; GISEL-GFX942-LABEL: memcpy_known: +; GISEL-GFX942: ; %bb.0: +; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GISEL-GFX942-NEXT: s_load_dword s11, s[4:5], 0x34 +; GISEL-GFX942-NEXT: s_mov_b32 s7, 0 +; GISEL-GFX942-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x44 +; GISEL-GFX942-NEXT: s_mov_b32 s8, s7 +; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX942-NEXT: s_mov_b32 s6, s1 +; GISEL-GFX942-NEXT: s_mov_b32 s9, s2 +; GISEL-GFX942-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] +; GISEL-GFX942-NEXT: s_mov_b32 s6, s3 +; GISEL-GFX942-NEXT: s_load_dword s3, s[4:5], 0x54 +; GISEL-GFX942-NEXT: s_mov_b32 s10, s7 +; GISEL-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] +; GISEL-GFX942-NEXT: s_mov_b32 s6, s13 +; GISEL-GFX942-NEXT: s_mov_b32 s4, s7 +; GISEL-GFX942-NEXT: s_mov_b32 s5, s14 +; GISEL-GFX942-NEXT: s_mov_b32 s16, 0 +; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; GISEL-GFX942-NEXT: s_mov_b32 s6, s15 +; GISEL-GFX942-NEXT: s_mov_b32 s2, s7 +; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[2:3] +; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x2000 +; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GISEL-GFX942-NEXT: .LBB0_1: ; %load-store-loop +; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v1 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v62, s[8:11], 0 offen offset:16 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:32 +; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1 +; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1 +; 
GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a0, v13 ; Reload Reuse +; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a1, v12 ; Reload Reuse +; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a2, v11 ; Reload Reuse +; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a3, v10 ; Reload Reuse +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v62, s[8:11], 0 offen offset:48 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v62, s[8:11], 0 offen offset:64 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v62, s[8:11], 0 offen offset:80 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v62, s[8:11], 0 offen offset:96 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v62, s[8:11], 0 offen offset:112 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v62, s[8:11], 0 offen offset:128 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v62, s[8:11], 0 offen offset:144 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v62, s[8:11], 0 offen offset:160 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v62, s[8:11], 0 offen offset:176 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v62, s[8:11], 0 offen offset:192 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v62, s[8:11], 0 offen offset:208 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v62, s[8:11], 0 offen offset:224 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:240 +; GISEL-GFX942-NEXT: s_nop 0 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(2) +; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse +; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse +; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse +; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse +; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse +; 
GISEL-GFX942-NEXT: v_accvgpr_write_b32 a5, v12 ; Reload Reuse +; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a6, v11 ; Reload Reuse +; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a7, v10 ; Reload Reuse +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:32 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224 +; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a4 ; Reload Reuse +; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a5 ; Reload Reuse +; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a6 ; Reload Reuse +; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a7 ; Reload Reuse +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240 +; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB0_1 +; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split +; GISEL-GFX942-NEXT: s_endpgm +; +; GISEL-GFX1100-LABEL: memcpy_known: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_clause 0x3 +; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GISEL-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44 +; GISEL-GFX1100-NEXT: s_load_b32 
s7, s[4:5], 0x34 +; GISEL-GFX1100-NEXT: s_load_b32 s15, s[4:5], 0x54 +; GISEL-GFX1100-NEXT: s_mov_b32 s17, 0 +; GISEL-GFX1100-NEXT: s_mov_b32 s12, 0 +; GISEL-GFX1100-NEXT: s_mov_b32 s4, s17 +; GISEL-GFX1100-NEXT: s_mov_b32 s6, s17 +; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, s12 +; GISEL-GFX1100-NEXT: s_mov_b32 s14, s17 +; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX1100-NEXT: s_mov_b32 s16, s1 +; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2 +; GISEL-GFX1100-NEXT: s_mov_b32 s2, s17 +; GISEL-GFX1100-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5] +; GISEL-GFX1100-NEXT: s_mov_b32 s16, s3 +; GISEL-GFX1100-NEXT: s_mov_b32 s3, s10 +; GISEL-GFX1100-NEXT: s_or_b64 s[6:7], s[16:17], s[6:7] +; GISEL-GFX1100-NEXT: s_mov_b32 s16, s9 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GISEL-GFX1100-NEXT: s_or_b64 s[12:13], s[16:17], s[2:3] +; GISEL-GFX1100-NEXT: s_mov_b32 s16, s11 +; GISEL-GFX1100-NEXT: s_or_b64 s[14:15], s[16:17], s[14:15] +; GISEL-GFX1100-NEXT: .LBB0_1: ; %load-store-loop +; GISEL-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0 +; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v65, s8, v0 +; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 +; GISEL-GFX1100-NEXT: s_clause 0xf +; GISEL-GFX1100-NEXT: buffer_load_b128 v[1:4], v61, s[4:7], 0 offen +; GISEL-GFX1100-NEXT: buffer_load_b128 v[5:8], v61, s[4:7], 0 offen offset:16 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[9:12], v61, s[4:7], 0 offen offset:32 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[13:16], v61, s[4:7], 0 offen offset:48 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[17:20], v61, s[4:7], 0 offen offset:64 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[21:24], v61, s[4:7], 0 offen offset:80 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[25:28], v61, s[4:7], 0 offen offset:96 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[29:32], v61, s[4:7], 0 offen offset:112 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[33:36], v61, s[4:7], 0 offen 
offset:128 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[37:40], v61, s[4:7], 0 offen offset:144 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[41:44], v61, s[4:7], 0 offen offset:160 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[45:48], v61, s[4:7], 0 offen offset:176 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[49:52], v61, s[4:7], 0 offen offset:192 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[53:56], v61, s[4:7], 0 offen offset:208 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[57:60], v61, s[4:7], 0 offen offset:224 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[61:64], v61, s[4:7], 0 offen offset:240 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[1:4], v65, s[12:15], 0 offen +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(14) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[5:8], v65, s[12:15], 0 offen offset:16 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(13) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[9:12], v65, s[12:15], 0 offen offset:32 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(12) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[13:16], v65, s[12:15], 0 offen offset:48 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(11) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[17:20], v65, s[12:15], 0 offen offset:64 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(10) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[21:24], v65, s[12:15], 0 offen offset:80 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(9) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[25:28], v65, s[12:15], 0 offen offset:96 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(8) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[29:32], v65, s[12:15], 0 offen offset:112 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(7) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[33:36], v65, s[12:15], 0 offen offset:128 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(6) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[37:40], v65, s[12:15], 0 offen offset:144 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(5) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[41:44], v65, s[12:15], 0 offen offset:160 +; GISEL-GFX1100-NEXT: s_waitcnt 
vmcnt(4) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[45:48], v65, s[12:15], 0 offen offset:176 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(3) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[49:52], v65, s[12:15], 0 offen offset:192 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(2) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[53:56], v65, s[12:15], 0 offen offset:208 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(1) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[57:60], v65, s[12:15], 0 offen offset:224 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[61:64], v65, s[12:15], 0 offen offset:240 +; GISEL-GFX1100-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x2000, v0 +; GISEL-GFX1100-NEXT: s_cbranch_vccnz .LBB0_1 +; GISEL-GFX1100-NEXT: ; %bb.2: ; %memcpy-split +; GISEL-GFX1100-NEXT: s_endpgm + call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false) + ret void +} + +define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) { +; SDAG-LABEL: memcpy_known_medium: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b32 s7, s24 +; SDAG-NEXT: s_mov_b32 s6, s23 +; SDAG-NEXT: s_mov_b32 s5, s22 +; SDAG-NEXT: s_mov_b32 s4, s21 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v57, off, s[0:3], s32 
offset:24 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SDAG-NEXT: .LBB1_1: ; %load-store-loop +; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-NEXT: v_add_u32_e32 v45, s20, v0 +; SDAG-NEXT: buffer_load_dwordx4 v[1:4], v45, s[16:19], 0 offen +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_load_dwordx4 v[5:8], v45, s[16:19], 0 offen offset:16 +; SDAG-NEXT: buffer_load_dwordx4 v[9:12], v45, s[16:19], 0 offen offset:32 +; SDAG-NEXT: buffer_load_dwordx4 v[13:16], v45, s[16:19], 0 offen offset:48 +; SDAG-NEXT: buffer_load_dwordx4 v[17:20], v45, s[16:19], 0 offen offset:64 +; SDAG-NEXT: buffer_load_dwordx4 v[21:24], v45, s[16:19], 0 offen offset:80 +; SDAG-NEXT: buffer_load_dwordx4 v[25:28], v45, s[16:19], 0 offen offset:96 +; SDAG-NEXT: buffer_load_dwordx4 v[29:32], v45, s[16:19], 0 offen offset:112 +; SDAG-NEXT: buffer_load_dwordx4 v[33:36], v45, s[16:19], 0 offen offset:128 +; SDAG-NEXT: buffer_load_dwordx4 v[48:51], v45, s[16:19], 0 offen offset:144 +; SDAG-NEXT: buffer_load_dwordx4 v[52:55], v45, s[16:19], 0 offen offset:160 +; SDAG-NEXT: buffer_load_dwordx4 v[37:40], v45, s[16:19], 0 offen offset:176 +; 
SDAG-NEXT: buffer_load_dwordx4 v[41:44], v45, s[16:19], 0 offen offset:192 +; SDAG-NEXT: buffer_load_dwordx4 v[56:59], v45, s[16:19], 0 offen offset:208 +; SDAG-NEXT: buffer_load_dwordx4 v[60:63], v45, s[16:19], 0 offen offset:224 +; SDAG-NEXT: buffer_load_dwordx4 v[1:4], v45, s[16:19], 0 offen offset:240 +; SDAG-NEXT: v_add_u32_e32 v45, s25, v0 +; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0 +; SDAG-NEXT: s_and_b64 vcc, exec, vcc +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: buffer_store_dwordx4 v[1:4], v45, s[4:7], 0 offen +; SDAG-NEXT: buffer_store_dwordx4 v[5:8], v45, s[4:7], 0 offen offset:16 +; SDAG-NEXT: buffer_store_dwordx4 v[9:12], v45, s[4:7], 0 offen offset:32 +; SDAG-NEXT: buffer_store_dwordx4 v[13:16], v45, s[4:7], 0 offen offset:48 +; SDAG-NEXT: buffer_store_dwordx4 v[17:20], v45, s[4:7], 0 offen offset:64 +; SDAG-NEXT: buffer_store_dwordx4 v[21:24], v45, s[4:7], 0 offen offset:80 +; SDAG-NEXT: buffer_store_dwordx4 v[25:28], v45, s[4:7], 0 offen offset:96 +; SDAG-NEXT: buffer_store_dwordx4 v[29:32], v45, s[4:7], 0 offen offset:112 +; SDAG-NEXT: buffer_store_dwordx4 v[33:36], v45, s[4:7], 0 offen offset:128 +; SDAG-NEXT: buffer_store_dwordx4 v[48:51], v45, s[4:7], 0 offen offset:144 +; SDAG-NEXT: 
buffer_store_dwordx4 v[52:55], v45, s[4:7], 0 offen offset:160 +; SDAG-NEXT: buffer_store_dwordx4 v[37:40], v45, s[4:7], 0 offen offset:176 +; SDAG-NEXT: buffer_store_dwordx4 v[41:44], v45, s[4:7], 0 offen offset:192 +; SDAG-NEXT: buffer_store_dwordx4 v[56:59], v45, s[4:7], 0 offen offset:208 +; SDAG-NEXT: buffer_store_dwordx4 v[60:63], v45, s[4:7], 0 offen offset:224 +; SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: buffer_store_dwordx4 v[1:4], v45, s[4:7], 0 offen offset:240 +; SDAG-NEXT: s_cbranch_vccnz .LBB1_1 +; SDAG-NEXT: ; %bb.2: ; %memcpy-split +; SDAG-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 
4-byte Folded Reload +; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: memcpy_known_medium: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b32 s8, 0 +; GISEL-NEXT: s_mov_b32 s4, s21 +; GISEL-NEXT: s_mov_b32 s5, s22 +; GISEL-NEXT: s_mov_b32 s6, s23 +; GISEL-NEXT: s_mov_b32 s7, s24 +; GISEL-NEXT: v_mov_b32_e32 v0, s8 +; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GISEL-NEXT: .LBB1_1: ; %load-store-loop +; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-NEXT: v_add_u32_e32 v45, s20, v0 +; GISEL-NEXT: buffer_load_dwordx4 v[1:4], v45, s[16:19], 0 offen +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:72 ; 4-byte Folded Spill +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_load_dwordx4 v[5:8], v45, s[16:19], 0 offen offset:16 +; GISEL-NEXT: buffer_load_dwordx4 v[9:12], v45, s[16:19], 0 offen offset:32 +; GISEL-NEXT: buffer_load_dwordx4 v[13:16], v45, s[16:19], 0 offen offset:48 +; GISEL-NEXT: buffer_load_dwordx4 v[17:20], v45, s[16:19], 0 offen offset:64 +; GISEL-NEXT: buffer_load_dwordx4 v[21:24], v45, s[16:19], 0 offen offset:80 +; GISEL-NEXT: buffer_load_dwordx4 v[25:28], v45, s[16:19], 0 offen offset:96 +; GISEL-NEXT: buffer_load_dwordx4 v[29:32], v45, s[16:19], 0 offen offset:112 +; GISEL-NEXT: buffer_load_dwordx4 v[33:36], v45, s[16:19], 0 offen offset:128 +; GISEL-NEXT: buffer_load_dwordx4 v[48:51], v45, s[16:19], 0 offen offset:144 +; GISEL-NEXT: buffer_load_dwordx4 v[52:55], v45, s[16:19], 0 offen offset:160 +; GISEL-NEXT: buffer_load_dwordx4 v[37:40], v45, s[16:19], 0 offen offset:176 +; GISEL-NEXT: buffer_load_dwordx4 v[41:44], v45, s[16:19], 0 offen offset:192 +; GISEL-NEXT: buffer_load_dwordx4 v[56:59], v45, s[16:19], 0 offen offset:208 +; GISEL-NEXT: buffer_load_dwordx4 v[60:63], v45, s[16:19], 0 offen offset:224 +; GISEL-NEXT: buffer_load_dwordx4 v[1:4], v45, s[16:19], 0 offen offset:240 +; GISEL-NEXT: v_add_u32_e32 v45, s25, v0 +; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0 +; GISEL-NEXT: s_xor_b64 s[8:9], vcc, -1 +; GISEL-NEXT: s_xor_b64 s[8:9], s[8:9], -1 +; GISEL-NEXT: s_and_b64 vcc, s[8:9], exec +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v3, off, s[0:3], s32 
offset:64 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: buffer_store_dwordx4 v[1:4], v45, s[4:7], 0 offen +; GISEL-NEXT: buffer_store_dwordx4 v[5:8], v45, s[4:7], 0 offen offset:16 +; GISEL-NEXT: buffer_store_dwordx4 v[9:12], v45, s[4:7], 0 offen offset:32 +; GISEL-NEXT: buffer_store_dwordx4 v[13:16], v45, s[4:7], 0 offen offset:48 +; GISEL-NEXT: buffer_store_dwordx4 v[17:20], v45, s[4:7], 0 offen offset:64 +; GISEL-NEXT: buffer_store_dwordx4 v[21:24], v45, s[4:7], 0 offen offset:80 +; GISEL-NEXT: buffer_store_dwordx4 v[25:28], v45, s[4:7], 0 offen offset:96 +; GISEL-NEXT: buffer_store_dwordx4 v[29:32], v45, s[4:7], 0 offen offset:112 +; GISEL-NEXT: buffer_store_dwordx4 v[33:36], v45, s[4:7], 0 offen offset:128 +; GISEL-NEXT: buffer_store_dwordx4 v[48:51], v45, s[4:7], 0 offen offset:144 +; GISEL-NEXT: buffer_store_dwordx4 v[52:55], v45, s[4:7], 0 offen offset:160 +; GISEL-NEXT: buffer_store_dwordx4 v[37:40], v45, s[4:7], 0 offen offset:176 +; GISEL-NEXT: buffer_store_dwordx4 v[41:44], v45, s[4:7], 0 offen offset:192 +; GISEL-NEXT: buffer_store_dwordx4 v[56:59], v45, s[4:7], 0 offen offset:208 +; GISEL-NEXT: buffer_store_dwordx4 v[60:63], v45, s[4:7], 0 offen offset:224 +; GISEL-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 
4-byte Folded Reload +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: buffer_store_dwordx4 v[1:4], v45, s[4:7], 0 offen offset:240 +; GISEL-NEXT: s_cbranch_vccnz .LBB1_1 +; GISEL-NEXT: ; %bb.2: ; %memcpy-split +; GISEL-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX942-LABEL: memcpy_known_medium: +; SDAG-GFX942: ; %bb.3: +; SDAG-GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; SDAG-GFX942-NEXT: s_load_dword s12, s[4:5], 0x10 +; SDAG-GFX942-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX942-NEXT: s_branch .LBB1_0 +; SDAG-GFX942-NEXT: .p2align 8 +; SDAG-GFX942-NEXT: ; %bb.4: +; SDAG-GFX942-NEXT: .LBB1_0: +; SDAG-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; SDAG-GFX942-NEXT: s_load_dword s13, s[4:5], 0x34 +; SDAG-GFX942-NEXT: s_load_dwordx4 
s[8:11], s[4:5], 0x44 +; SDAG-GFX942-NEXT: s_load_dword s14, s[4:5], 0x54 +; SDAG-GFX942-NEXT: s_mov_b32 s12, 0 +; SDAG-GFX942-NEXT: s_mov_b32 s5, s12 +; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-GFX942-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX942-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX942-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] +; SDAG-GFX942-NEXT: s_mov_b32 s13, s2 +; SDAG-GFX942-NEXT: s_mov_b32 s2, s1 +; SDAG-GFX942-NEXT: s_mov_b32 s3, s12 +; SDAG-GFX942-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] +; SDAG-GFX942-NEXT: s_mov_b32 s13, s14 +; SDAG-GFX942-NEXT: s_mov_b32 s2, s11 +; SDAG-GFX942-NEXT: s_or_b64 s[14:15], s[2:3], s[12:13] +; SDAG-GFX942-NEXT: s_mov_b32 s13, s10 +; SDAG-GFX942-NEXT: s_mov_b32 s2, s9 +; SDAG-GFX942-NEXT: s_or_b64 s[12:13], s[2:3], s[12:13] +; SDAG-GFX942-NEXT: .LBB1_1: ; %load-store-loop +; SDAG-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-GFX942-NEXT: v_add_u32_e32 v1, s0, v0 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[4:7], 0 offen +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[4:7], 0 offen offset:16 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[4:7], 0 offen offset:32 +; SDAG-GFX942-NEXT: v_add_u32_e32 v62, s8, v0 +; SDAG-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0 +; SDAG-GFX942-NEXT: s_and_b64 vcc, exec, vcc +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a0, v13 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a1, v12 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a2, v11 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a3, v10 ; Reload Reuse +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[4:7], 0 offen offset:48 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[4:7], 0 offen offset:64 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[4:7], 0 offen offset:80 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v1, s[4:7], 0 offen offset:96 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v1, s[4:7], 0 offen 
offset:112 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v1, s[4:7], 0 offen offset:128 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v1, s[4:7], 0 offen offset:144 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v1, s[4:7], 0 offen offset:160 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v1, s[4:7], 0 offen offset:176 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[4:7], 0 offen offset:192 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[4:7], 0 offen offset:208 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[4:7], 0 offen offset:224 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[4:7], 0 offen offset:240 +; SDAG-GFX942-NEXT: s_nop 0 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[12:15], 0 offen offset:16 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(2) +; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a5, v12 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a6, v11 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a7, v10 ; Reload Reuse +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen offset:32 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[12:15], 0 offen offset:48 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[12:15], 0 offen offset:64 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[12:15], 0 offen offset:80 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v62, s[12:15], 0 offen offset:96 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v62, s[12:15], 0 offen offset:112 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v62, s[12:15], 0 offen offset:128 +; 
SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v62, s[12:15], 0 offen offset:144 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v62, s[12:15], 0 offen offset:160 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v62, s[12:15], 0 offen offset:176 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[12:15], 0 offen offset:192 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[12:15], 0 offen offset:208 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[12:15], 0 offen offset:224 +; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a4 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a5 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a6 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a7 ; Reload Reuse +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen offset:240 +; SDAG-GFX942-NEXT: s_cbranch_vccnz .LBB1_1 +; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split +; SDAG-GFX942-NEXT: s_endpgm +; +; SDAG-GFX1100-LABEL: memcpy_known_medium: +; SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_clause 0x3 +; SDAG-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; SDAG-GFX1100-NEXT: s_load_b32 s13, s[4:5], 0x34 +; SDAG-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44 +; SDAG-GFX1100-NEXT: s_load_b32 s18, s[4:5], 0x54 +; SDAG-GFX1100-NEXT: s_mov_b32 s12, 0 +; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-GFX1100-NEXT: s_mov_b32 s5, s12 +; SDAG-GFX1100-NEXT: s_mov_b32 s15, s12 +; SDAG-GFX1100-NEXT: s_mov_b32 s17, s12 +; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX1100-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX1100-NEXT: s_mov_b32 s14, s1 +; SDAG-GFX1100-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] +; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2 +; SDAG-GFX1100-NEXT: s_mov_b32 s16, s11 +; SDAG-GFX1100-NEXT: s_or_b64 s[4:5], s[14:15], s[12:13] +; SDAG-GFX1100-NEXT: s_mov_b32 s13, s18 +; SDAG-GFX1100-NEXT: s_mov_b32 s2, s9 +; SDAG-GFX1100-NEXT: s_or_b64 s[14:15], s[16:17], s[12:13] +; SDAG-GFX1100-NEXT: s_mov_b32 s13, s10 +; 
SDAG-GFX1100-NEXT: s_mov_b32 s3, s12 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; SDAG-GFX1100-NEXT: s_or_b64 s[12:13], s[2:3], s[12:13] +; SDAG-GFX1100-NEXT: .LBB1_1: ; %load-store-loop +; SDAG-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0 +; SDAG-GFX1100-NEXT: v_add_nc_u32_e32 v65, s8, v0 +; SDAG-GFX1100-NEXT: v_add_co_u32 v0, s1, 0x100, v0 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s1 +; SDAG-GFX1100-NEXT: s_clause 0xf +; SDAG-GFX1100-NEXT: buffer_load_b128 v[1:4], v61, s[4:7], 0 offen +; SDAG-GFX1100-NEXT: buffer_load_b128 v[5:8], v61, s[4:7], 0 offen offset:16 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[9:12], v61, s[4:7], 0 offen offset:32 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[13:16], v61, s[4:7], 0 offen offset:48 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[17:20], v61, s[4:7], 0 offen offset:64 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[21:24], v61, s[4:7], 0 offen offset:80 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[25:28], v61, s[4:7], 0 offen offset:96 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[29:32], v61, s[4:7], 0 offen offset:112 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[33:36], v61, s[4:7], 0 offen offset:128 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[37:40], v61, s[4:7], 0 offen offset:144 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[41:44], v61, s[4:7], 0 offen offset:160 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[45:48], v61, s[4:7], 0 offen offset:176 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[49:52], v61, s[4:7], 0 offen offset:192 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[53:56], v61, s[4:7], 0 offen offset:208 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[57:60], v61, s[4:7], 0 offen offset:224 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[61:64], v61, s[4:7], 0 offen offset:240 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[1:4], v65, s[12:15], 0 offen +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(14) +; 
SDAG-GFX1100-NEXT: buffer_store_b128 v[5:8], v65, s[12:15], 0 offen offset:16 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(13) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[9:12], v65, s[12:15], 0 offen offset:32 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(12) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[13:16], v65, s[12:15], 0 offen offset:48 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(11) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[17:20], v65, s[12:15], 0 offen offset:64 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(10) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[21:24], v65, s[12:15], 0 offen offset:80 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(9) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[25:28], v65, s[12:15], 0 offen offset:96 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(8) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[29:32], v65, s[12:15], 0 offen offset:112 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(7) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[33:36], v65, s[12:15], 0 offen offset:128 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(6) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[37:40], v65, s[12:15], 0 offen offset:144 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(5) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[41:44], v65, s[12:15], 0 offen offset:160 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(4) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[45:48], v65, s[12:15], 0 offen offset:176 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(3) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[49:52], v65, s[12:15], 0 offen offset:192 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(2) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[53:56], v65, s[12:15], 0 offen offset:208 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(1) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[57:60], v65, s[12:15], 0 offen offset:224 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[61:64], v65, s[12:15], 0 offen offset:240 +; SDAG-GFX1100-NEXT: s_cbranch_vccnz .LBB1_1 +; SDAG-GFX1100-NEXT: ; %bb.2: ; %memcpy-split +; SDAG-GFX1100-NEXT: s_endpgm +; +; GISEL-GFX942-LABEL: memcpy_known_medium: +; 
GISEL-GFX942: ; %bb.0: +; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GISEL-GFX942-NEXT: s_load_dword s11, s[4:5], 0x34 +; GISEL-GFX942-NEXT: s_mov_b32 s7, 0 +; GISEL-GFX942-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x44 +; GISEL-GFX942-NEXT: s_mov_b32 s8, s7 +; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX942-NEXT: s_mov_b32 s6, s1 +; GISEL-GFX942-NEXT: s_mov_b32 s9, s2 +; GISEL-GFX942-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] +; GISEL-GFX942-NEXT: s_mov_b32 s6, s3 +; GISEL-GFX942-NEXT: s_load_dword s3, s[4:5], 0x54 +; GISEL-GFX942-NEXT: s_mov_b32 s10, s7 +; GISEL-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] +; GISEL-GFX942-NEXT: s_mov_b32 s6, s13 +; GISEL-GFX942-NEXT: s_mov_b32 s4, s7 +; GISEL-GFX942-NEXT: s_mov_b32 s5, s14 +; GISEL-GFX942-NEXT: s_mov_b32 s16, 0 +; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; GISEL-GFX942-NEXT: s_mov_b32 s6, s15 +; GISEL-GFX942-NEXT: s_mov_b32 s2, s7 +; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[2:3] +; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s16 +; GISEL-GFX942-NEXT: .LBB1_1: ; %load-store-loop +; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-GFX942-NEXT: v_add_u32_e32 v1, s0, v0 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[8:11], 0 offen +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[8:11], 0 offen offset:16 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[8:11], 0 offen offset:32 +; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s12, v0 +; GISEL-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0 +; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], vcc, -1 +; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], -1 +; GISEL-GFX942-NEXT: s_and_b64 vcc, s[2:3], exec +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a0, v13 ; Reload Reuse +; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a1, v12 ; Reload Reuse +; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a2, v11 ; Reload Reuse +; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a3, v10 
; Reload Reuse +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[8:11], 0 offen offset:48 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[8:11], 0 offen offset:64 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[8:11], 0 offen offset:80 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v1, s[8:11], 0 offen offset:96 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v1, s[8:11], 0 offen offset:112 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v1, s[8:11], 0 offen offset:128 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v1, s[8:11], 0 offen offset:144 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v1, s[8:11], 0 offen offset:160 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v1, s[8:11], 0 offen offset:176 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[8:11], 0 offen offset:192 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[8:11], 0 offen offset:208 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[8:11], 0 offen offset:224 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[8:11], 0 offen offset:240 +; GISEL-GFX942-NEXT: s_nop 0 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[4:7], 0 offen offset:16 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(2) +; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse +; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse +; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse +; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse +; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse +; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a5, v12 ; Reload Reuse +; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a6, v11 ; Reload Reuse +; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a7, v10 ; Reload Reuse +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:32 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[4:7], 0 offen 
offset:48 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[4:7], 0 offen offset:64 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[4:7], 0 offen offset:80 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v62, s[4:7], 0 offen offset:96 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v62, s[4:7], 0 offen offset:112 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v62, s[4:7], 0 offen offset:128 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v62, s[4:7], 0 offen offset:144 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v62, s[4:7], 0 offen offset:160 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v62, s[4:7], 0 offen offset:176 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[4:7], 0 offen offset:192 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[4:7], 0 offen offset:208 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[4:7], 0 offen offset:224 +; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a4 ; Reload Reuse +; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a5 ; Reload Reuse +; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a6 ; Reload Reuse +; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a7 ; Reload Reuse +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:240 +; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB1_1 +; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split +; GISEL-GFX942-NEXT: s_endpgm +; +; GISEL-GFX1100-LABEL: memcpy_known_medium: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_clause 0x3 +; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GISEL-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44 +; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x34 +; GISEL-GFX1100-NEXT: s_load_b32 s15, s[4:5], 0x54 +; GISEL-GFX1100-NEXT: s_mov_b32 s17, 0 +; GISEL-GFX1100-NEXT: s_mov_b32 s12, 0 +; GISEL-GFX1100-NEXT: s_mov_b32 s4, s17 +; GISEL-GFX1100-NEXT: s_mov_b32 s6, s17 +; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, s12 +; GISEL-GFX1100-NEXT: s_mov_b32 s14, s17 +; GISEL-GFX1100-NEXT: 
s_waitcnt lgkmcnt(0) +; GISEL-GFX1100-NEXT: s_mov_b32 s16, s1 +; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2 +; GISEL-GFX1100-NEXT: s_mov_b32 s2, s17 +; GISEL-GFX1100-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5] +; GISEL-GFX1100-NEXT: s_mov_b32 s16, s3 +; GISEL-GFX1100-NEXT: s_mov_b32 s3, s10 +; GISEL-GFX1100-NEXT: s_or_b64 s[6:7], s[16:17], s[6:7] +; GISEL-GFX1100-NEXT: s_mov_b32 s16, s9 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GISEL-GFX1100-NEXT: s_or_b64 s[12:13], s[16:17], s[2:3] +; GISEL-GFX1100-NEXT: s_mov_b32 s16, s11 +; GISEL-GFX1100-NEXT: s_or_b64 s[14:15], s[16:17], s[14:15] +; GISEL-GFX1100-NEXT: .LBB1_1: ; %load-store-loop +; GISEL-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0 +; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v65, s8, v0 +; GISEL-GFX1100-NEXT: v_add_co_u32 v0, s1, 0x100, v0 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX1100-NEXT: s_xor_b32 s1, s1, -1 +; GISEL-GFX1100-NEXT: s_clause 0xf +; GISEL-GFX1100-NEXT: buffer_load_b128 v[1:4], v61, s[4:7], 0 offen +; GISEL-GFX1100-NEXT: buffer_load_b128 v[5:8], v61, s[4:7], 0 offen offset:16 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[9:12], v61, s[4:7], 0 offen offset:32 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[13:16], v61, s[4:7], 0 offen offset:48 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[17:20], v61, s[4:7], 0 offen offset:64 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[21:24], v61, s[4:7], 0 offen offset:80 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[25:28], v61, s[4:7], 0 offen offset:96 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[29:32], v61, s[4:7], 0 offen offset:112 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[33:36], v61, s[4:7], 0 offen offset:128 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[37:40], v61, s[4:7], 0 offen offset:144 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[41:44], v61, s[4:7], 0 offen offset:160 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[45:48], v61, s[4:7], 0 offen 
offset:176 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[49:52], v61, s[4:7], 0 offen offset:192 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[53:56], v61, s[4:7], 0 offen offset:208 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[57:60], v61, s[4:7], 0 offen offset:224 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[61:64], v61, s[4:7], 0 offen offset:240 +; GISEL-GFX1100-NEXT: s_xor_b32 s1, s1, -1 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[1:4], v65, s[12:15], 0 offen +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(14) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[5:8], v65, s[12:15], 0 offen offset:16 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(13) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[9:12], v65, s[12:15], 0 offen offset:32 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(12) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[13:16], v65, s[12:15], 0 offen offset:48 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(11) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[17:20], v65, s[12:15], 0 offen offset:64 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(10) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[21:24], v65, s[12:15], 0 offen offset:80 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(9) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[25:28], v65, s[12:15], 0 offen offset:96 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(8) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[29:32], v65, s[12:15], 0 offen offset:112 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(7) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[33:36], v65, s[12:15], 0 offen offset:128 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(6) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[37:40], v65, s[12:15], 0 offen offset:144 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(5) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[41:44], v65, s[12:15], 0 offen offset:160 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(4) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[45:48], v65, s[12:15], 0 offen offset:176 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(3) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[49:52], v65, s[12:15], 
0 offen offset:192 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(2) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[53:56], v65, s[12:15], 0 offen offset:208 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(1) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[57:60], v65, s[12:15], 0 offen offset:224 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[61:64], v65, s[12:15], 0 offen offset:240 +; GISEL-GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s1 +; GISEL-GFX1100-NEXT: s_cbranch_vccnz .LBB1_1 +; GISEL-GFX1100-NEXT: ; %bb.2: ; %memcpy-split +; GISEL-GFX1100-NEXT: s_endpgm + call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 256, i1 false) + ret void +} + +define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) { +; SDAG-LABEL: memcpy_known_small: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v4, s20 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], v4, s[16:19], 0 offen +; SDAG-NEXT: s_mov_b32 s7, s24 +; SDAG-NEXT: s_mov_b32 s6, s23 +; SDAG-NEXT: s_mov_b32 s5, s22 +; SDAG-NEXT: s_mov_b32 s4, s21 +; SDAG-NEXT: v_mov_b32_e32 v5, s25 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], v4, s[16:19], 0 offen offset:16 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen offset:16 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: memcpy_known_small: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v4, s20 +; GISEL-NEXT: buffer_load_dwordx4 v[0:3], v4, s[16:19], 0 offen +; GISEL-NEXT: s_mov_b32 s4, s21 +; GISEL-NEXT: s_mov_b32 s5, s22 +; GISEL-NEXT: s_mov_b32 s6, s23 +; GISEL-NEXT: s_mov_b32 s7, s24 +; GISEL-NEXT: v_mov_b32_e32 v5, s25 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; 
GISEL-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen +; GISEL-NEXT: buffer_load_dwordx4 v[0:3], v4, s[16:19], 0 offen offset:16 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen offset:16 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX942-LABEL: memcpy_known_small: +; SDAG-GFX942: ; %bb.1: +; SDAG-GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; SDAG-GFX942-NEXT: s_load_dword s12, s[4:5], 0x10 +; SDAG-GFX942-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX942-NEXT: s_branch .LBB2_0 +; SDAG-GFX942-NEXT: .p2align 8 +; SDAG-GFX942-NEXT: ; %bb.2: +; SDAG-GFX942-NEXT: .LBB2_0: +; SDAG-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; SDAG-GFX942-NEXT: s_load_dword s13, s[4:5], 0x34 +; SDAG-GFX942-NEXT: s_mov_b32 s12, 0 +; SDAG-GFX942-NEXT: s_mov_b32 s7, s12 +; SDAG-GFX942-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX942-NEXT: s_mov_b32 s6, s3 +; SDAG-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] +; SDAG-GFX942-NEXT: s_mov_b32 s13, s2 +; SDAG-GFX942-NEXT: s_mov_b32 s2, s1 +; SDAG-GFX942-NEXT: s_mov_b32 s3, s12 +; SDAG-GFX942-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13] +; SDAG-GFX942-NEXT: v_mov_b32_e32 v4, s0 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen +; SDAG-GFX942-NEXT: s_load_dword s13, s[4:5], 0x54 +; SDAG-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44 +; SDAG-GFX942-NEXT: s_mov_b32 s5, s12 +; SDAG-GFX942-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX942-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX942-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] +; SDAG-GFX942-NEXT: s_mov_b32 s13, s2 +; SDAG-GFX942-NEXT: s_mov_b32 s2, s1 +; SDAG-GFX942-NEXT: s_mov_b32 s3, s12 +; SDAG-GFX942-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] +; SDAG-GFX942-NEXT: v_mov_b32_e32 v5, s0 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:16 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) +; 
SDAG-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen offset:16 +; SDAG-GFX942-NEXT: s_endpgm +; +; SDAG-GFX1100-LABEL: memcpy_known_small: +; SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_clause 0x1 +; SDAG-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; SDAG-GFX1100-NEXT: s_load_b32 s13, s[4:5], 0x34 +; SDAG-GFX1100-NEXT: s_mov_b32 s12, 0 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; SDAG-GFX1100-NEXT: s_mov_b32 s7, s12 +; SDAG-GFX1100-NEXT: s_mov_b32 s9, s12 +; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX1100-NEXT: s_mov_b32 s6, s3 +; SDAG-GFX1100-NEXT: v_mov_b32_e32 v4, s0 +; SDAG-GFX1100-NEXT: s_mov_b32 s8, s1 +; SDAG-GFX1100-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] +; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; SDAG-GFX1100-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; SDAG-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen +; SDAG-GFX1100-NEXT: s_clause 0x1 +; SDAG-GFX1100-NEXT: s_load_b32 s13, s[4:5], 0x54 +; SDAG-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 +; SDAG-GFX1100-NEXT: s_mov_b32 s5, s12 +; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX1100-NEXT: v_mov_b32_e32 v5, s0 +; SDAG-GFX1100-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX1100-NEXT: s_mov_b32 s3, s12 +; SDAG-GFX1100-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] +; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2 +; SDAG-GFX1100-NEXT: s_mov_b32 s2, s1 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; SDAG-GFX1100-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[0:3], v5, s[4:7], 0 offen +; SDAG-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen offset:16 +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX1100-NEXT: buffer_store_b128 v[0:3], v5, s[4:7], 0 offen offset:16 +; SDAG-GFX1100-NEXT: s_endpgm +; +; GISEL-GFX942-LABEL: memcpy_known_small: +; GISEL-GFX942: ; %bb.0: +; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; 
GISEL-GFX942-NEXT: s_load_dword s11, s[4:5], 0x34 +; GISEL-GFX942-NEXT: s_mov_b32 s7, 0 +; GISEL-GFX942-NEXT: s_mov_b32 s8, s7 +; GISEL-GFX942-NEXT: s_mov_b32 s10, s7 +; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX942-NEXT: s_mov_b32 s6, s1 +; GISEL-GFX942-NEXT: s_mov_b32 s9, s2 +; GISEL-GFX942-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] +; GISEL-GFX942-NEXT: s_mov_b32 s6, s3 +; GISEL-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] +; GISEL-GFX942-NEXT: v_mov_b32_e32 v4, s0 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen +; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44 +; GISEL-GFX942-NEXT: s_load_dword s13, s[4:5], 0x54 +; GISEL-GFX942-NEXT: s_mov_b32 s4, s7 +; GISEL-GFX942-NEXT: s_mov_b32 s12, s7 +; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX942-NEXT: s_mov_b32 s6, s1 +; GISEL-GFX942-NEXT: s_mov_b32 s5, s2 +; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; GISEL-GFX942-NEXT: s_mov_b32 s6, s3 +; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13] +; GISEL-GFX942-NEXT: v_mov_b32_e32 v5, s0 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:16 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen offset:16 +; GISEL-GFX942-NEXT: s_endpgm +; +; GISEL-GFX1100-LABEL: memcpy_known_small: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_clause 0x1 +; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x34 +; GISEL-GFX1100-NEXT: s_mov_b32 s13, 0 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-GFX1100-NEXT: s_mov_b32 s8, s13 +; GISEL-GFX1100-NEXT: s_mov_b32 s6, s13 +; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX1100-NEXT: s_mov_b32 s12, s1 +; GISEL-GFX1100-NEXT: s_mov_b32 s9, s2 +; GISEL-GFX1100-NEXT: v_mov_b32_e32 v4, s0 +; GISEL-GFX1100-NEXT: s_or_b64 
s[0:1], s[12:13], s[8:9] +; GISEL-GFX1100-NEXT: s_mov_b32 s12, s3 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-GFX1100-NEXT: s_or_b64 s[2:3], s[12:13], s[6:7] +; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[0:3], 0 offen +; GISEL-GFX1100-NEXT: s_clause 0x1 +; GISEL-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44 +; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x54 +; GISEL-GFX1100-NEXT: s_mov_b32 s4, s13 +; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_mov_b32_e32 v5, s8 +; GISEL-GFX1100-NEXT: s_mov_b32 s12, s9 +; GISEL-GFX1100-NEXT: s_mov_b32 s5, s10 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GISEL-GFX1100-NEXT: s_or_b64 s[4:5], s[12:13], s[4:5] +; GISEL-GFX1100-NEXT: s_mov_b32 s12, s11 +; GISEL-GFX1100-NEXT: s_or_b64 s[6:7], s[12:13], s[6:7] +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[0:3], v5, s[4:7], 0 offen +; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[0:3], 0 offen offset:16 +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX1100-NEXT: buffer_store_b128 v[0:3], v5, s[4:7], 0 offen offset:16 +; GISEL-GFX1100-NEXT: s_endpgm + call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 32, i1 false) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll new file mode 100644 index 0000000000000..e6c2d1907068f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll @@ -0,0 +1,1730 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mcpu=gfx900 -amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s +; RUN: opt -S -mcpu=gfx900 -passes=amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s + +target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8" +target triple = "amdgcn--" + +;; memcpy + +declare void @llvm.memcpy.p7.p7.i32(ptr addrspace(7), ptr addrspace(7), i32, i1) +declare void @llvm.memcpy.p1.p7.i32(ptr addrspace(1), ptr addrspace(7), i32, i1) +declare void @llvm.memcpy.p7.p1.i32(ptr addrspace(7), ptr addrspace(1), i32, i1) +declare void @llvm.memcpy.p7.p7.i64(ptr addrspace(7), ptr addrspace(7), i64, i1) +declare void @llvm.memcpy.p3.p7.i32(ptr addrspace(3), ptr addrspace(7), i32, i1) + +define void @memcpy_known(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) { +; CHECK-LABEL: define void @memcpy_known( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 +; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 +; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 +; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 +; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] +; CHECK: [[LOAD_STORE_LOOP]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] +; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_0:%.*]] = shufflevector <64 x i32> poison, <64 x i32> [[DOTEXT_0]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add nuw i32 [[TMP1]], 16 +; CHECK-NEXT: [[DOTOFF_16:%.*]] = call <4 x i32> 
@llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_16]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_4:%.*]] = shufflevector <4 x i32> [[DOTOFF_16]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_4:%.*]] = shufflevector <64 x i32> [[DOTPARTS_0]], <64 x i32> [[DOTEXT_4]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add nuw i32 [[TMP1]], 32 +; CHECK-NEXT: [[DOTOFF_32:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_32]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_8:%.*]] = shufflevector <4 x i32> [[DOTOFF_32]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_8:%.*]] = shufflevector <64 x i32> [[DOTPARTS_4]], <64 x i32> [[DOTEXT_8]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add nuw i32 [[TMP1]], 48 +; CHECK-NEXT: [[DOTOFF_48:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_48]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_12:%.*]] = shufflevector <4 x i32> [[DOTOFF_48]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_12:%.*]] = shufflevector <64 x i32> [[DOTPARTS_8]], <64 x i32> [[DOTEXT_12]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add nuw i32 [[TMP1]], 64 +; CHECK-NEXT: [[DOTOFF_64:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_64]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_16:%.*]] = shufflevector <4 x i32> [[DOTOFF_64]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_16:%.*]] = shufflevector <64 x i32> [[DOTPARTS_12]], <64 x i32> [[DOTEXT_16]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add nuw i32 [[TMP1]], 80 +; CHECK-NEXT: [[DOTOFF_80:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_80]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_20:%.*]] = shufflevector <4 x i32> [[DOTOFF_80]], <4 x i32> poison, <64 
x i32> +; CHECK-NEXT: [[DOTPARTS_20:%.*]] = shufflevector <64 x i32> [[DOTPARTS_16]], <64 x i32> [[DOTEXT_20]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add nuw i32 [[TMP1]], 96 +; CHECK-NEXT: [[DOTOFF_96:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_96]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_24:%.*]] = shufflevector <4 x i32> [[DOTOFF_96]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_24:%.*]] = shufflevector <64 x i32> [[DOTPARTS_20]], <64 x i32> [[DOTEXT_24]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add nuw i32 [[TMP1]], 112 +; CHECK-NEXT: [[DOTOFF_112:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_112]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_28:%.*]] = shufflevector <4 x i32> [[DOTOFF_112]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_28:%.*]] = shufflevector <64 x i32> [[DOTPARTS_24]], <64 x i32> [[DOTEXT_28]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add nuw i32 [[TMP1]], 128 +; CHECK-NEXT: [[DOTOFF_128:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_128]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_32:%.*]] = shufflevector <4 x i32> [[DOTOFF_128]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_32:%.*]] = shufflevector <64 x i32> [[DOTPARTS_28]], <64 x i32> [[DOTEXT_32]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add nuw i32 [[TMP1]], 144 +; CHECK-NEXT: [[DOTOFF_144:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_144]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_36:%.*]] = shufflevector <4 x i32> [[DOTOFF_144]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_36:%.*]] = shufflevector <64 x i32> [[DOTPARTS_32]], <64 x i32> [[DOTEXT_36]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add nuw i32 [[TMP1]], 160 
+; CHECK-NEXT: [[DOTOFF_160:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_160]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_40:%.*]] = shufflevector <4 x i32> [[DOTOFF_160]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_40:%.*]] = shufflevector <64 x i32> [[DOTPARTS_36]], <64 x i32> [[DOTEXT_40]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add nuw i32 [[TMP1]], 176 +; CHECK-NEXT: [[DOTOFF_176:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_176]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_44:%.*]] = shufflevector <4 x i32> [[DOTOFF_176]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_44:%.*]] = shufflevector <64 x i32> [[DOTPARTS_40]], <64 x i32> [[DOTEXT_44]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add nuw i32 [[TMP1]], 192 +; CHECK-NEXT: [[DOTOFF_192:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_192]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_48:%.*]] = shufflevector <4 x i32> [[DOTOFF_192]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_48:%.*]] = shufflevector <64 x i32> [[DOTPARTS_44]], <64 x i32> [[DOTEXT_48]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add nuw i32 [[TMP1]], 208 +; CHECK-NEXT: [[DOTOFF_208:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_208]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_52:%.*]] = shufflevector <4 x i32> [[DOTOFF_208]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_52:%.*]] = shufflevector <64 x i32> [[DOTPARTS_48]], <64 x i32> [[DOTEXT_52]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add nuw i32 [[TMP1]], 224 +; CHECK-NEXT: [[DOTOFF_224:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_224]], i32 0, i32 0) +; CHECK-NEXT: 
[[DOTEXT_56:%.*]] = shufflevector <4 x i32> [[DOTOFF_224]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_56:%.*]] = shufflevector <64 x i32> [[DOTPARTS_52]], <64 x i32> [[DOTEXT_56]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add nuw i32 [[TMP1]], 240 +; CHECK-NEXT: [[DOTOFF_240:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_240]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_60:%.*]] = shufflevector <4 x i32> [[DOTOFF_240]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[DOTPARTS_56]], <64 x i32> [[DOTEXT_60]], <64 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX]] +; CHECK-NEXT: [[DOTSLICE_0:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_0]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[TMP3]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP3]], 16 +; CHECK-NEXT: [[DOTSLICE_4:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_4]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP3]], 32 +; CHECK-NEXT: [[DOTSLICE_8:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_8]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP3]], 48 +; CHECK-NEXT: [[DOTSLICE_12:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_12]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw 
i32 [[TMP3]], 64 +; CHECK-NEXT: [[DOTSLICE_16:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_16]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP3]], 80 +; CHECK-NEXT: [[DOTSLICE_20:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_20]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP3]], 96 +; CHECK-NEXT: [[DOTSLICE_24:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_24]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP3]], 112 +; CHECK-NEXT: [[DOTSLICE_28:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_28]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP3]], 128 +; CHECK-NEXT: [[DOTSLICE_32:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_32]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP3]], 144 +; CHECK-NEXT: [[DOTSLICE_36:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_36]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP3]], 160 +; 
CHECK-NEXT: [[DOTSLICE_40:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_40]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP3]], 176 +; CHECK-NEXT: [[DOTSLICE_44:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_44]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP3]], 192 +; CHECK-NEXT: [[DOTSLICE_48:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_48]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP3]], 208 +; CHECK-NEXT: [[DOTSLICE_52:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_52]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP3]], 224 +; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240 +; CHECK-NEXT: [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) +; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 +; CHECK-NEXT: [[TMP5:%.*]] = icmp 
ult i32 [[TMP4]], 8192 +; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] +; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false) + ret void +} + +define void @memcpy_known_small(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) { +; CHECK-LABEL: define void @memcpy_known_small( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 +; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 +; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 +; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP1]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DST_OFF]], i32 0, i32 0) +; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[SRC_OFF]], 16 +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP2]], i32 0, i32 0) +; CHECK-NEXT: [[TMP4:%.*]] = add nuw i32 [[DST_OFF]], 16 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP3]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP4]], i32 0, i32 0) +; CHECK-NEXT: ret void +; + call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 32, i1 false) + ret void +} + +define void @memcpy_known_byte(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) { +; CHECK-LABEL: define void @memcpy_known_byte( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { 
ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 +; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 +; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 +; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[TMP1]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DST_OFF]], i32 0, i32 0) +; CHECK-NEXT: ret void +; + call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 1, i1 false) + ret void +} + +define void @memcpy_known_tail(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) { +; CHECK-LABEL: define void @memcpy_known_tail( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 +; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 +; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 +; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.raw.ptr.buffer.load.i64(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i64(i64 [[TMP1]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DST_OFF]], i32 0, i32 0) +; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[SRC_OFF]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP2]], i32 0, i32 0) +; CHECK-NEXT: [[TMP4:%.*]] = add nuw i32 [[DST_OFF]], 8 +; CHECK-NEXT: call void 
@llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[TMP3]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP4]], i32 0, i32 0) +; CHECK-NEXT: [[TMP5:%.*]] = add nuw i32 [[SRC_OFF]], 12 +; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP5]], i32 0, i32 0) +; CHECK-NEXT: [[TMP7:%.*]] = add nuw i32 [[DST_OFF]], 12 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[TMP6]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP7]], i32 0, i32 0) +; CHECK-NEXT: [[TMP8:%.*]] = add nuw i32 [[SRC_OFF]], 14 +; CHECK-NEXT: [[TMP9:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP8]], i32 0, i32 0) +; CHECK-NEXT: [[TMP10:%.*]] = add nuw i32 [[DST_OFF]], 14 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[TMP9]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP10]], i32 0, i32 0) +; CHECK-NEXT: ret void +; + call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 15, i1 false) + ret void +} + +define void @memcpy_known_i64(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) { +; CHECK-LABEL: define void @memcpy_known_i64( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 +; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 +; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 +; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 +; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] +; CHECK: [[LOAD_STORE_LOOP]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: [[LOOP_INDEX_C:%.*]] = trunc i64 [[LOOP_INDEX]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX_C]] +; CHECK-NEXT: 
[[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_0:%.*]] = shufflevector <64 x i32> poison, <64 x i32> [[DOTEXT_0]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add nuw i32 [[TMP1]], 16 +; CHECK-NEXT: [[DOTOFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_16]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_4:%.*]] = shufflevector <4 x i32> [[DOTOFF_16]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_4:%.*]] = shufflevector <64 x i32> [[DOTPARTS_0]], <64 x i32> [[DOTEXT_4]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add nuw i32 [[TMP1]], 32 +; CHECK-NEXT: [[DOTOFF_32:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_32]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_8:%.*]] = shufflevector <4 x i32> [[DOTOFF_32]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_8:%.*]] = shufflevector <64 x i32> [[DOTPARTS_4]], <64 x i32> [[DOTEXT_8]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add nuw i32 [[TMP1]], 48 +; CHECK-NEXT: [[DOTOFF_48:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_48]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_12:%.*]] = shufflevector <4 x i32> [[DOTOFF_48]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_12:%.*]] = shufflevector <64 x i32> [[DOTPARTS_8]], <64 x i32> [[DOTEXT_12]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add nuw i32 [[TMP1]], 64 +; CHECK-NEXT: [[DOTOFF_64:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_64]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_16:%.*]] = shufflevector <4 x i32> [[DOTOFF_64]], <4 x i32> 
poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_16:%.*]] = shufflevector <64 x i32> [[DOTPARTS_12]], <64 x i32> [[DOTEXT_16]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add nuw i32 [[TMP1]], 80 +; CHECK-NEXT: [[DOTOFF_80:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_80]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_20:%.*]] = shufflevector <4 x i32> [[DOTOFF_80]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_20:%.*]] = shufflevector <64 x i32> [[DOTPARTS_16]], <64 x i32> [[DOTEXT_20]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add nuw i32 [[TMP1]], 96 +; CHECK-NEXT: [[DOTOFF_96:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_96]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_24:%.*]] = shufflevector <4 x i32> [[DOTOFF_96]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_24:%.*]] = shufflevector <64 x i32> [[DOTPARTS_20]], <64 x i32> [[DOTEXT_24]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add nuw i32 [[TMP1]], 112 +; CHECK-NEXT: [[DOTOFF_112:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_112]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_28:%.*]] = shufflevector <4 x i32> [[DOTOFF_112]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_28:%.*]] = shufflevector <64 x i32> [[DOTPARTS_24]], <64 x i32> [[DOTEXT_28]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add nuw i32 [[TMP1]], 128 +; CHECK-NEXT: [[DOTOFF_128:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_128]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_32:%.*]] = shufflevector <4 x i32> [[DOTOFF_128]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_32:%.*]] = shufflevector <64 x i32> [[DOTPARTS_28]], <64 x i32> [[DOTEXT_32]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add nuw i32 [[TMP1]], 
144 +; CHECK-NEXT: [[DOTOFF_144:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_144]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_36:%.*]] = shufflevector <4 x i32> [[DOTOFF_144]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_36:%.*]] = shufflevector <64 x i32> [[DOTPARTS_32]], <64 x i32> [[DOTEXT_36]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add nuw i32 [[TMP1]], 160 +; CHECK-NEXT: [[DOTOFF_160:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_160]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_40:%.*]] = shufflevector <4 x i32> [[DOTOFF_160]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_40:%.*]] = shufflevector <64 x i32> [[DOTPARTS_36]], <64 x i32> [[DOTEXT_40]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add nuw i32 [[TMP1]], 176 +; CHECK-NEXT: [[DOTOFF_176:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_176]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_44:%.*]] = shufflevector <4 x i32> [[DOTOFF_176]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_44:%.*]] = shufflevector <64 x i32> [[DOTPARTS_40]], <64 x i32> [[DOTEXT_44]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add nuw i32 [[TMP1]], 192 +; CHECK-NEXT: [[DOTOFF_192:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_192]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_48:%.*]] = shufflevector <4 x i32> [[DOTOFF_192]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_48:%.*]] = shufflevector <64 x i32> [[DOTPARTS_44]], <64 x i32> [[DOTEXT_48]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add nuw i32 [[TMP1]], 208 +; CHECK-NEXT: [[DOTOFF_208:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_208]], i32 0, i32 0) +; CHECK-NEXT: 
[[DOTEXT_52:%.*]] = shufflevector <4 x i32> [[DOTOFF_208]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_52:%.*]] = shufflevector <64 x i32> [[DOTPARTS_48]], <64 x i32> [[DOTEXT_52]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add nuw i32 [[TMP1]], 224 +; CHECK-NEXT: [[DOTOFF_224:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_224]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_56:%.*]] = shufflevector <4 x i32> [[DOTOFF_224]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_56:%.*]] = shufflevector <64 x i32> [[DOTPARTS_52]], <64 x i32> [[DOTEXT_56]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add nuw i32 [[TMP1]], 240 +; CHECK-NEXT: [[DOTOFF_240:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_240]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_60:%.*]] = shufflevector <4 x i32> [[DOTOFF_240]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[DOTPARTS_56]], <64 x i32> [[DOTEXT_60]], <64 x i32> +; CHECK-NEXT: [[LOOP_INDEX_C1:%.*]] = trunc i64 [[LOOP_INDEX]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX_C1]] +; CHECK-NEXT: [[DOTSLICE_0:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_0]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP3]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP3]], 16 +; CHECK-NEXT: [[DOTSLICE_4:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_4]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP3]], 32 +; CHECK-NEXT: [[DOTSLICE_8:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: 
call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_8]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP3]], 48 +; CHECK-NEXT: [[DOTSLICE_12:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_12]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw i32 [[TMP3]], 64 +; CHECK-NEXT: [[DOTSLICE_16:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_16]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP3]], 80 +; CHECK-NEXT: [[DOTSLICE_20:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_20]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP3]], 96 +; CHECK-NEXT: [[DOTSLICE_24:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_24]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP3]], 112 +; CHECK-NEXT: [[DOTSLICE_28:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_28]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP3]], 128 +; CHECK-NEXT: [[DOTSLICE_32:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void 
@llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_32]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP3]], 144 +; CHECK-NEXT: [[DOTSLICE_36:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_36]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP3]], 160 +; CHECK-NEXT: [[DOTSLICE_40:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_40]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP3]], 176 +; CHECK-NEXT: [[DOTSLICE_44:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_44]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP3]], 192 +; CHECK-NEXT: [[DOTSLICE_48:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_48]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP3]], 208 +; CHECK-NEXT: [[DOTSLICE_52:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_52]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP3]], 224 +; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void 
@llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240 +; CHECK-NEXT: [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) +; CHECK-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 8192 +; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] +; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.memcpy.p7.p7.i64(ptr addrspace(7) %dst, ptr addrspace(7) %src, i64 8192, i1 false) + ret void +} + +define void @memcpy_known_i32_volatile(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) { +; CHECK-LABEL: define void @memcpy_known_i32_volatile( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 +; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 +; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 +; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 -2147483648) +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP1]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DST_OFF]], i32 0, i32 -2147483648) +; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[SRC_OFF]], 16 +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP2]], i32 0, i32 -2147483648) +; 
CHECK-NEXT: [[TMP4:%.*]] = add nuw i32 [[DST_OFF]], 16 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP3]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP4]], i32 0, i32 -2147483648) +; CHECK-NEXT: ret void +; + call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 32, i1 true) + ret void +} + +define void @memcpy_unknown(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst, i32 inreg %length) { +; CHECK-LABEL: define void @memcpy_unknown( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]], i32 inreg [[LENGTH:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 +; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 +; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 +; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[LENGTH]], 15 +; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[LENGTH]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[TMP3]], label %[[LOOP_MEMCPY_EXPANSION:.*]], label %[[LOOP_MEMCPY_RESIDUAL_HEADER:.*]] +; CHECK: [[LOOP_MEMCPY_EXPANSION]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP7:%.*]], %[[LOOP_MEMCPY_EXPANSION]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP4]], i32 0, i32 0) +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP5]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP6]], i32 0, i32 0) +; CHECK-NEXT: [[TMP7]] = add i32 [[LOOP_INDEX]], 16 +; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], [[TMP2]] +; CHECK-NEXT: br i1 [[TMP8]], label 
%[[LOOP_MEMCPY_EXPANSION]], label %[[LOOP_MEMCPY_RESIDUAL_HEADER]] +; CHECK: [[LOOP_MEMCPY_RESIDUAL:.*]]: +; CHECK-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, %[[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP13:%.*]], %[[LOOP_MEMCPY_RESIDUAL]] ] +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP2]], [[RESIDUAL_LOOP_INDEX]] +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[SRC_OFF]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP10]], i32 0, i32 0) +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[DST_OFF]], [[TMP9]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[TMP11]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP12]], i32 0, i32 0) +; CHECK-NEXT: [[TMP13]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ult i32 [[TMP13]], [[TMP1]] +; CHECK-NEXT: br i1 [[TMP14]], label %[[LOOP_MEMCPY_RESIDUAL]], label %[[POST_LOOP_MEMCPY_EXPANSION:.*]] +; CHECK: [[POST_LOOP_MEMCPY_EXPANSION]]: +; CHECK-NEXT: ret void +; CHECK: [[LOOP_MEMCPY_RESIDUAL_HEADER]]: +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i32 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[TMP15]], label %[[LOOP_MEMCPY_RESIDUAL]], label %[[POST_LOOP_MEMCPY_EXPANSION]] +; + call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 %length, i1 false) + ret void +} + +define void @memcpy_known_p1_to_p7(ptr addrspace(1) inreg %src, ptr addrspace(7) inreg %dst) { +; CHECK-LABEL: define void @memcpy_known_p1_to_p7( +; CHECK-SAME: ptr addrspace(1) inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 +; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 +; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] +; CHECK: [[LOAD_STORE_LOOP]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr 
inbounds i8, ptr addrspace(1) [[SRC]], i32 [[LOOP_INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 16 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX]] +; CHECK-NEXT: [[DOTSLICE_0:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_0]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[TMP3]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP3]], 16 +; CHECK-NEXT: [[DOTSLICE_4:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_4]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP3]], 32 +; CHECK-NEXT: [[DOTSLICE_8:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_8]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP3]], 48 +; CHECK-NEXT: [[DOTSLICE_12:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_12]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw i32 [[TMP3]], 64 +; CHECK-NEXT: [[DOTSLICE_16:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_16]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP3]], 80 +; CHECK-NEXT: [[DOTSLICE_20:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 
x i32> [[DOTSLICE_20]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP3]], 96 +; CHECK-NEXT: [[DOTSLICE_24:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_24]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP3]], 112 +; CHECK-NEXT: [[DOTSLICE_28:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_28]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP3]], 128 +; CHECK-NEXT: [[DOTSLICE_32:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_32]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP3]], 144 +; CHECK-NEXT: [[DOTSLICE_36:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_36]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP3]], 160 +; CHECK-NEXT: [[DOTSLICE_40:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_40]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP3]], 176 +; CHECK-NEXT: [[DOTSLICE_44:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_44]], 
ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP3]], 192 +; CHECK-NEXT: [[DOTSLICE_48:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_48]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP3]], 208 +; CHECK-NEXT: [[DOTSLICE_52:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_52]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP3]], 224 +; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240 +; CHECK-NEXT: [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) +; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 +; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] +; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.memcpy.p7.p1.i32(ptr addrspace(7) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i32 8192, i1 false) + ret void +} + +define void @memcpy_known_p7_to_p1(ptr addrspace(7) inreg %src, ptr addrspace(1) inreg %dst) { +; CHECK-LABEL: define void @memcpy_known_p7_to_p1( +; CHECK-SAME: { ptr addrspace(8), i32 } 
inreg [[SRC:%.*]], ptr addrspace(1) inreg [[DST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 +; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 +; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] +; CHECK: [[LOAD_STORE_LOOP]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] +; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_0:%.*]] = shufflevector <64 x i32> poison, <64 x i32> [[DOTEXT_0]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add nuw i32 [[TMP1]], 16 +; CHECK-NEXT: [[DOTOFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_16]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_4:%.*]] = shufflevector <4 x i32> [[DOTOFF_16]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_4:%.*]] = shufflevector <64 x i32> [[DOTPARTS_0]], <64 x i32> [[DOTEXT_4]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add nuw i32 [[TMP1]], 32 +; CHECK-NEXT: [[DOTOFF_32:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_32]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_8:%.*]] = shufflevector <4 x i32> [[DOTOFF_32]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_8:%.*]] = shufflevector <64 x i32> [[DOTPARTS_4]], <64 x i32> [[DOTEXT_8]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add nuw i32 [[TMP1]], 48 +; CHECK-NEXT: [[DOTOFF_48:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_48]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_12:%.*]] 
= shufflevector <4 x i32> [[DOTOFF_48]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_12:%.*]] = shufflevector <64 x i32> [[DOTPARTS_8]], <64 x i32> [[DOTEXT_12]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add nuw i32 [[TMP1]], 64 +; CHECK-NEXT: [[DOTOFF_64:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_64]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_16:%.*]] = shufflevector <4 x i32> [[DOTOFF_64]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_16:%.*]] = shufflevector <64 x i32> [[DOTPARTS_12]], <64 x i32> [[DOTEXT_16]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add nuw i32 [[TMP1]], 80 +; CHECK-NEXT: [[DOTOFF_80:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_80]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_20:%.*]] = shufflevector <4 x i32> [[DOTOFF_80]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_20:%.*]] = shufflevector <64 x i32> [[DOTPARTS_16]], <64 x i32> [[DOTEXT_20]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add nuw i32 [[TMP1]], 96 +; CHECK-NEXT: [[DOTOFF_96:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_96]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_24:%.*]] = shufflevector <4 x i32> [[DOTOFF_96]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_24:%.*]] = shufflevector <64 x i32> [[DOTPARTS_20]], <64 x i32> [[DOTEXT_24]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add nuw i32 [[TMP1]], 112 +; CHECK-NEXT: [[DOTOFF_112:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_112]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_28:%.*]] = shufflevector <4 x i32> [[DOTOFF_112]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_28:%.*]] = shufflevector <64 x i32> [[DOTPARTS_24]], <64 x i32> [[DOTEXT_28]], <64 x i32> +; 
CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add nuw i32 [[TMP1]], 128 +; CHECK-NEXT: [[DOTOFF_128:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_128]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_32:%.*]] = shufflevector <4 x i32> [[DOTOFF_128]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_32:%.*]] = shufflevector <64 x i32> [[DOTPARTS_28]], <64 x i32> [[DOTEXT_32]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add nuw i32 [[TMP1]], 144 +; CHECK-NEXT: [[DOTOFF_144:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_144]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_36:%.*]] = shufflevector <4 x i32> [[DOTOFF_144]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_36:%.*]] = shufflevector <64 x i32> [[DOTPARTS_32]], <64 x i32> [[DOTEXT_36]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add nuw i32 [[TMP1]], 160 +; CHECK-NEXT: [[DOTOFF_160:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_160]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_40:%.*]] = shufflevector <4 x i32> [[DOTOFF_160]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_40:%.*]] = shufflevector <64 x i32> [[DOTPARTS_36]], <64 x i32> [[DOTEXT_40]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add nuw i32 [[TMP1]], 176 +; CHECK-NEXT: [[DOTOFF_176:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_176]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_44:%.*]] = shufflevector <4 x i32> [[DOTOFF_176]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_44:%.*]] = shufflevector <64 x i32> [[DOTPARTS_40]], <64 x i32> [[DOTEXT_44]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add nuw i32 [[TMP1]], 192 +; CHECK-NEXT: [[DOTOFF_192:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 
[[SRC_RSRC]], i32 [[DOTOFF_PTR_192]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_48:%.*]] = shufflevector <4 x i32> [[DOTOFF_192]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_48:%.*]] = shufflevector <64 x i32> [[DOTPARTS_44]], <64 x i32> [[DOTEXT_48]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add nuw i32 [[TMP1]], 208 +; CHECK-NEXT: [[DOTOFF_208:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_208]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_52:%.*]] = shufflevector <4 x i32> [[DOTOFF_208]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_52:%.*]] = shufflevector <64 x i32> [[DOTPARTS_48]], <64 x i32> [[DOTEXT_52]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add nuw i32 [[TMP1]], 224 +; CHECK-NEXT: [[DOTOFF_224:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_224]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_56:%.*]] = shufflevector <4 x i32> [[DOTOFF_224]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_56:%.*]] = shufflevector <64 x i32> [[DOTPARTS_52]], <64 x i32> [[DOTEXT_56]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add nuw i32 [[TMP1]], 240 +; CHECK-NEXT: [[DOTOFF_240:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_240]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_60:%.*]] = shufflevector <4 x i32> [[DOTOFF_240]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[DOTPARTS_56]], <64 x i32> [[DOTEXT_60]], <64 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i32 [[LOOP_INDEX]] +; CHECK-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 16 +; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 +; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label 
%[[MEMCPY_SPLIT:.*]] +; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.memcpy.p1.p7.i32(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false) + ret void +} + +;; This could be the direct-to-LDS intrinsics in a future patch +define void @memcpy_known_p7_to_p3(ptr addrspace(7) inreg %src, ptr addrspace(3) inreg %dst) { +; CHECK-LABEL: define void @memcpy_known_p7_to_p3( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], ptr addrspace(3) inreg [[DST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 +; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 0) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 0 +; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr addrspace(3) [[TMP2]], align 16 +; CHECK-NEXT: ret void +; + call void @llvm.memcpy.p3.p7.i32(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 16, i1 false) + ret void +} + +define void @memcpy_known_p7_to_p3_byte(ptr addrspace(7) inreg %src, ptr addrspace(3) inreg %dst) { +; CHECK-LABEL: define void @memcpy_known_p7_to_p3_byte( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], ptr addrspace(3) inreg [[DST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 +; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 0) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 0 +; CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(3) [[TMP2]], align 16 +; 
CHECK-NEXT: ret void +; + call void @llvm.memcpy.p3.p7.i32(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 4, i1 false) + ret void +} + +define void @memcpy_known_p7_to_p3_long(ptr addrspace(7) inreg %src, ptr addrspace(3) inreg %dst) { +; CHECK-LABEL: define void @memcpy_known_p7_to_p3_long( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], ptr addrspace(3) inreg [[DST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 +; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 +; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] +; CHECK: [[LOAD_STORE_LOOP]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] +; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_0:%.*]] = shufflevector <64 x i32> poison, <64 x i32> [[DOTEXT_0]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add nuw i32 [[TMP1]], 16 +; CHECK-NEXT: [[DOTOFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_16]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_4:%.*]] = shufflevector <4 x i32> [[DOTOFF_16]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_4:%.*]] = shufflevector <64 x i32> [[DOTPARTS_0]], <64 x i32> [[DOTEXT_4]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add nuw i32 [[TMP1]], 32 +; CHECK-NEXT: [[DOTOFF_32:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_32]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_8:%.*]] = shufflevector <4 x i32> [[DOTOFF_32]], <4 x i32> poison, 
<64 x i32> +; CHECK-NEXT: [[DOTPARTS_8:%.*]] = shufflevector <64 x i32> [[DOTPARTS_4]], <64 x i32> [[DOTEXT_8]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add nuw i32 [[TMP1]], 48 +; CHECK-NEXT: [[DOTOFF_48:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_48]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_12:%.*]] = shufflevector <4 x i32> [[DOTOFF_48]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_12:%.*]] = shufflevector <64 x i32> [[DOTPARTS_8]], <64 x i32> [[DOTEXT_12]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add nuw i32 [[TMP1]], 64 +; CHECK-NEXT: [[DOTOFF_64:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_64]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_16:%.*]] = shufflevector <4 x i32> [[DOTOFF_64]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_16:%.*]] = shufflevector <64 x i32> [[DOTPARTS_12]], <64 x i32> [[DOTEXT_16]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add nuw i32 [[TMP1]], 80 +; CHECK-NEXT: [[DOTOFF_80:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_80]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_20:%.*]] = shufflevector <4 x i32> [[DOTOFF_80]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_20:%.*]] = shufflevector <64 x i32> [[DOTPARTS_16]], <64 x i32> [[DOTEXT_20]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add nuw i32 [[TMP1]], 96 +; CHECK-NEXT: [[DOTOFF_96:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_96]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_24:%.*]] = shufflevector <4 x i32> [[DOTOFF_96]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_24:%.*]] = shufflevector <64 x i32> [[DOTPARTS_20]], <64 x i32> [[DOTEXT_24]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add nuw i32 [[TMP1]], 112 +; CHECK-NEXT: 
[[DOTOFF_112:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_112]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_28:%.*]] = shufflevector <4 x i32> [[DOTOFF_112]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_28:%.*]] = shufflevector <64 x i32> [[DOTPARTS_24]], <64 x i32> [[DOTEXT_28]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add nuw i32 [[TMP1]], 128 +; CHECK-NEXT: [[DOTOFF_128:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_128]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_32:%.*]] = shufflevector <4 x i32> [[DOTOFF_128]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_32:%.*]] = shufflevector <64 x i32> [[DOTPARTS_28]], <64 x i32> [[DOTEXT_32]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add nuw i32 [[TMP1]], 144 +; CHECK-NEXT: [[DOTOFF_144:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_144]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_36:%.*]] = shufflevector <4 x i32> [[DOTOFF_144]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_36:%.*]] = shufflevector <64 x i32> [[DOTPARTS_32]], <64 x i32> [[DOTEXT_36]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add nuw i32 [[TMP1]], 160 +; CHECK-NEXT: [[DOTOFF_160:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_160]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_40:%.*]] = shufflevector <4 x i32> [[DOTOFF_160]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_40:%.*]] = shufflevector <64 x i32> [[DOTPARTS_36]], <64 x i32> [[DOTEXT_40]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add nuw i32 [[TMP1]], 176 +; CHECK-NEXT: [[DOTOFF_176:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_176]], i32 0, i32 0) +; CHECK-NEXT: 
[[DOTEXT_44:%.*]] = shufflevector <4 x i32> [[DOTOFF_176]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_44:%.*]] = shufflevector <64 x i32> [[DOTPARTS_40]], <64 x i32> [[DOTEXT_44]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add nuw i32 [[TMP1]], 192 +; CHECK-NEXT: [[DOTOFF_192:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_192]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_48:%.*]] = shufflevector <4 x i32> [[DOTOFF_192]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_48:%.*]] = shufflevector <64 x i32> [[DOTPARTS_44]], <64 x i32> [[DOTEXT_48]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add nuw i32 [[TMP1]], 208 +; CHECK-NEXT: [[DOTOFF_208:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_208]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_52:%.*]] = shufflevector <4 x i32> [[DOTOFF_208]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_52:%.*]] = shufflevector <64 x i32> [[DOTPARTS_48]], <64 x i32> [[DOTEXT_52]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add nuw i32 [[TMP1]], 224 +; CHECK-NEXT: [[DOTOFF_224:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_224]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_56:%.*]] = shufflevector <4 x i32> [[DOTOFF_224]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_56:%.*]] = shufflevector <64 x i32> [[DOTPARTS_52]], <64 x i32> [[DOTEXT_56]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add nuw i32 [[TMP1]], 240 +; CHECK-NEXT: [[DOTOFF_240:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_240]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_60:%.*]] = shufflevector <4 x i32> [[DOTOFF_240]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[DOTPARTS_56]], <64 x i32> 
[[DOTEXT_60]], <64 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[LOOP_INDEX]] +; CHECK-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 16 +; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 +; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] +; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.memcpy.p3.p7.i32(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false) + ret void +} + +;; memcpy.inline + +declare void @llvm.memcpy.inline.p7.p7.i32(ptr addrspace(7), ptr addrspace(7), i32, i1) +declare void @llvm.memcpy.inline.p1.p7.i32(ptr addrspace(1), ptr addrspace(7), i32, i1) +declare void @llvm.memcpy.inline.p7.p1.i32(ptr addrspace(7), ptr addrspace(1), i32, i1) +declare void @llvm.memcpy.inline.p7.p7.i64(ptr addrspace(7), ptr addrspace(7), i64, i1) +declare void @llvm.memcpy.inline.p3.p7.i32(ptr addrspace(3), ptr addrspace(7), i32, i1) + +define void @memcpy.inline_known(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) { +; CHECK-LABEL: define void @memcpy.inline_known( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 +; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 +; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 +; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 +; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] +; CHECK: [[LOAD_STORE_LOOP]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] +; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> 
@llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_0:%.*]] = shufflevector <64 x i32> poison, <64 x i32> [[DOTEXT_0]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add nuw i32 [[TMP1]], 16 +; CHECK-NEXT: [[DOTOFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_16]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_4:%.*]] = shufflevector <4 x i32> [[DOTOFF_16]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_4:%.*]] = shufflevector <64 x i32> [[DOTPARTS_0]], <64 x i32> [[DOTEXT_4]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add nuw i32 [[TMP1]], 32 +; CHECK-NEXT: [[DOTOFF_32:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_32]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_8:%.*]] = shufflevector <4 x i32> [[DOTOFF_32]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_8:%.*]] = shufflevector <64 x i32> [[DOTPARTS_4]], <64 x i32> [[DOTEXT_8]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add nuw i32 [[TMP1]], 48 +; CHECK-NEXT: [[DOTOFF_48:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_48]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_12:%.*]] = shufflevector <4 x i32> [[DOTOFF_48]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_12:%.*]] = shufflevector <64 x i32> [[DOTPARTS_8]], <64 x i32> [[DOTEXT_12]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add nuw i32 [[TMP1]], 64 +; CHECK-NEXT: [[DOTOFF_64:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_64]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_16:%.*]] = shufflevector <4 x i32> [[DOTOFF_64]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: 
[[DOTPARTS_16:%.*]] = shufflevector <64 x i32> [[DOTPARTS_12]], <64 x i32> [[DOTEXT_16]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add nuw i32 [[TMP1]], 80 +; CHECK-NEXT: [[DOTOFF_80:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_80]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_20:%.*]] = shufflevector <4 x i32> [[DOTOFF_80]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_20:%.*]] = shufflevector <64 x i32> [[DOTPARTS_16]], <64 x i32> [[DOTEXT_20]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add nuw i32 [[TMP1]], 96 +; CHECK-NEXT: [[DOTOFF_96:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_96]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_24:%.*]] = shufflevector <4 x i32> [[DOTOFF_96]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_24:%.*]] = shufflevector <64 x i32> [[DOTPARTS_20]], <64 x i32> [[DOTEXT_24]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add nuw i32 [[TMP1]], 112 +; CHECK-NEXT: [[DOTOFF_112:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_112]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_28:%.*]] = shufflevector <4 x i32> [[DOTOFF_112]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_28:%.*]] = shufflevector <64 x i32> [[DOTPARTS_24]], <64 x i32> [[DOTEXT_28]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add nuw i32 [[TMP1]], 128 +; CHECK-NEXT: [[DOTOFF_128:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_128]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_32:%.*]] = shufflevector <4 x i32> [[DOTOFF_128]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_32:%.*]] = shufflevector <64 x i32> [[DOTPARTS_28]], <64 x i32> [[DOTEXT_32]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add nuw i32 [[TMP1]], 144 +; CHECK-NEXT: 
[[DOTOFF_144:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_144]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_36:%.*]] = shufflevector <4 x i32> [[DOTOFF_144]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_36:%.*]] = shufflevector <64 x i32> [[DOTPARTS_32]], <64 x i32> [[DOTEXT_36]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add nuw i32 [[TMP1]], 160 +; CHECK-NEXT: [[DOTOFF_160:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_160]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_40:%.*]] = shufflevector <4 x i32> [[DOTOFF_160]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_40:%.*]] = shufflevector <64 x i32> [[DOTPARTS_36]], <64 x i32> [[DOTEXT_40]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add nuw i32 [[TMP1]], 176 +; CHECK-NEXT: [[DOTOFF_176:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_176]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_44:%.*]] = shufflevector <4 x i32> [[DOTOFF_176]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_44:%.*]] = shufflevector <64 x i32> [[DOTPARTS_40]], <64 x i32> [[DOTEXT_44]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add nuw i32 [[TMP1]], 192 +; CHECK-NEXT: [[DOTOFF_192:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_192]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_48:%.*]] = shufflevector <4 x i32> [[DOTOFF_192]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_48:%.*]] = shufflevector <64 x i32> [[DOTPARTS_44]], <64 x i32> [[DOTEXT_48]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add nuw i32 [[TMP1]], 208 +; CHECK-NEXT: [[DOTOFF_208:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_208]], i32 0, i32 0) +; CHECK-NEXT: 
[[DOTEXT_52:%.*]] = shufflevector <4 x i32> [[DOTOFF_208]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_52:%.*]] = shufflevector <64 x i32> [[DOTPARTS_48]], <64 x i32> [[DOTEXT_52]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add nuw i32 [[TMP1]], 224 +; CHECK-NEXT: [[DOTOFF_224:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_224]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_56:%.*]] = shufflevector <4 x i32> [[DOTOFF_224]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_56:%.*]] = shufflevector <64 x i32> [[DOTPARTS_52]], <64 x i32> [[DOTEXT_56]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add nuw i32 [[TMP1]], 240 +; CHECK-NEXT: [[DOTOFF_240:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_240]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_60:%.*]] = shufflevector <4 x i32> [[DOTOFF_240]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[DOTPARTS_56]], <64 x i32> [[DOTEXT_60]], <64 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX]] +; CHECK-NEXT: [[DOTSLICE_0:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_0]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[TMP3]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP3]], 16 +; CHECK-NEXT: [[DOTSLICE_4:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_4]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP3]], 32 +; CHECK-NEXT: [[DOTSLICE_8:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> 
[[DOTSLICE_8]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP3]], 48 +; CHECK-NEXT: [[DOTSLICE_12:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_12]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw i32 [[TMP3]], 64 +; CHECK-NEXT: [[DOTSLICE_16:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_16]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP3]], 80 +; CHECK-NEXT: [[DOTSLICE_20:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_20]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP3]], 96 +; CHECK-NEXT: [[DOTSLICE_24:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_24]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP3]], 112 +; CHECK-NEXT: [[DOTSLICE_28:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_28]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP3]], 128 +; CHECK-NEXT: [[DOTSLICE_32:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_32]], ptr 
addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP3]], 144 +; CHECK-NEXT: [[DOTSLICE_36:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_36]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP3]], 160 +; CHECK-NEXT: [[DOTSLICE_40:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_40]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP3]], 176 +; CHECK-NEXT: [[DOTSLICE_44:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_44]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP3]], 192 +; CHECK-NEXT: [[DOTSLICE_48:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_48]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP3]], 208 +; CHECK-NEXT: [[DOTSLICE_52:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_52]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP3]], 224 +; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 16 
[[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240 +; CHECK-NEXT: [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) +; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 +; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] +; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.memcpy.inline.p7.p7.i32(ptr addrspace(7) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false) + ret void +} + +define void @memcpy.inline_known_small(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) { +; CHECK-LABEL: define void @memcpy.inline_known_small( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 +; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 +; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 +; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP1]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DST_OFF]], i32 0, i32 0) +; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[SRC_OFF]], 16 +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP2]], i32 0, i32 0) +; CHECK-NEXT: [[TMP4:%.*]] = add nuw i32 [[DST_OFF]], 16 +; 
CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP3]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP4]], i32 0, i32 0) +; CHECK-NEXT: ret void +; + call void @llvm.memcpy.inline.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 32, i1 false) + ret void +} + +define void @memcpy.inline_known_byte(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) { +; CHECK-LABEL: define void @memcpy.inline_known_byte( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 +; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 +; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 +; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[TMP1]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DST_OFF]], i32 0, i32 0) +; CHECK-NEXT: ret void +; + call void @llvm.memcpy.inline.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 1, i1 false) + ret void +} + +define void @memcpy.inline_known_tail(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) { +; CHECK-LABEL: define void @memcpy.inline_known_tail( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 +; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 +; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 +; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 
@llvm.amdgcn.raw.ptr.buffer.load.i64(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i64(i64 [[TMP1]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DST_OFF]], i32 0, i32 0) +; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[SRC_OFF]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP2]], i32 0, i32 0) +; CHECK-NEXT: [[TMP4:%.*]] = add nuw i32 [[DST_OFF]], 8 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[TMP3]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP4]], i32 0, i32 0) +; CHECK-NEXT: [[TMP5:%.*]] = add nuw i32 [[SRC_OFF]], 12 +; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP5]], i32 0, i32 0) +; CHECK-NEXT: [[TMP7:%.*]] = add nuw i32 [[DST_OFF]], 12 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[TMP6]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP7]], i32 0, i32 0) +; CHECK-NEXT: [[TMP8:%.*]] = add nuw i32 [[SRC_OFF]], 14 +; CHECK-NEXT: [[TMP9:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP8]], i32 0, i32 0) +; CHECK-NEXT: [[TMP10:%.*]] = add nuw i32 [[DST_OFF]], 14 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[TMP9]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP10]], i32 0, i32 0) +; CHECK-NEXT: ret void +; + call void @llvm.memcpy.inline.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 15, i1 false) + ret void +} + +define void @memcpy.inline_known_i64(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) { +; CHECK-LABEL: define void @memcpy.inline_known_i64( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 +; CHECK-NEXT: [[DST_OFF:%.*]] = 
extractvalue { ptr addrspace(8), i32 } [[DST]], 1 +; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 +; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 +; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] +; CHECK: [[LOAD_STORE_LOOP]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: [[LOOP_INDEX_C:%.*]] = trunc i64 [[LOOP_INDEX]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX_C]] +; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_0:%.*]] = shufflevector <64 x i32> poison, <64 x i32> [[DOTEXT_0]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add nuw i32 [[TMP1]], 16 +; CHECK-NEXT: [[DOTOFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_16]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_4:%.*]] = shufflevector <4 x i32> [[DOTOFF_16]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_4:%.*]] = shufflevector <64 x i32> [[DOTPARTS_0]], <64 x i32> [[DOTEXT_4]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add nuw i32 [[TMP1]], 32 +; CHECK-NEXT: [[DOTOFF_32:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_32]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_8:%.*]] = shufflevector <4 x i32> [[DOTOFF_32]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_8:%.*]] = shufflevector <64 x i32> [[DOTPARTS_4]], <64 x i32> [[DOTEXT_8]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add nuw i32 [[TMP1]], 48 +; CHECK-NEXT: [[DOTOFF_48:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 
[[DOTOFF_PTR_48]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_12:%.*]] = shufflevector <4 x i32> [[DOTOFF_48]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_12:%.*]] = shufflevector <64 x i32> [[DOTPARTS_8]], <64 x i32> [[DOTEXT_12]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add nuw i32 [[TMP1]], 64 +; CHECK-NEXT: [[DOTOFF_64:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_64]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_16:%.*]] = shufflevector <4 x i32> [[DOTOFF_64]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_16:%.*]] = shufflevector <64 x i32> [[DOTPARTS_12]], <64 x i32> [[DOTEXT_16]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add nuw i32 [[TMP1]], 80 +; CHECK-NEXT: [[DOTOFF_80:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_80]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_20:%.*]] = shufflevector <4 x i32> [[DOTOFF_80]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_20:%.*]] = shufflevector <64 x i32> [[DOTPARTS_16]], <64 x i32> [[DOTEXT_20]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add nuw i32 [[TMP1]], 96 +; CHECK-NEXT: [[DOTOFF_96:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_96]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_24:%.*]] = shufflevector <4 x i32> [[DOTOFF_96]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_24:%.*]] = shufflevector <64 x i32> [[DOTPARTS_20]], <64 x i32> [[DOTEXT_24]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add nuw i32 [[TMP1]], 112 +; CHECK-NEXT: [[DOTOFF_112:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_112]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_28:%.*]] = shufflevector <4 x i32> [[DOTOFF_112]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_28:%.*]] = shufflevector <64 x i32> 
[[DOTPARTS_24]], <64 x i32> [[DOTEXT_28]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add nuw i32 [[TMP1]], 128 +; CHECK-NEXT: [[DOTOFF_128:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_128]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_32:%.*]] = shufflevector <4 x i32> [[DOTOFF_128]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_32:%.*]] = shufflevector <64 x i32> [[DOTPARTS_28]], <64 x i32> [[DOTEXT_32]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add nuw i32 [[TMP1]], 144 +; CHECK-NEXT: [[DOTOFF_144:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_144]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_36:%.*]] = shufflevector <4 x i32> [[DOTOFF_144]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_36:%.*]] = shufflevector <64 x i32> [[DOTPARTS_32]], <64 x i32> [[DOTEXT_36]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add nuw i32 [[TMP1]], 160 +; CHECK-NEXT: [[DOTOFF_160:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_160]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_40:%.*]] = shufflevector <4 x i32> [[DOTOFF_160]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_40:%.*]] = shufflevector <64 x i32> [[DOTPARTS_36]], <64 x i32> [[DOTEXT_40]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add nuw i32 [[TMP1]], 176 +; CHECK-NEXT: [[DOTOFF_176:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_176]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_44:%.*]] = shufflevector <4 x i32> [[DOTOFF_176]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_44:%.*]] = shufflevector <64 x i32> [[DOTPARTS_40]], <64 x i32> [[DOTEXT_44]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add nuw i32 [[TMP1]], 192 +; CHECK-NEXT: [[DOTOFF_192:%.*]] = call <4 x i32> 
@llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_192]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_48:%.*]] = shufflevector <4 x i32> [[DOTOFF_192]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_48:%.*]] = shufflevector <64 x i32> [[DOTPARTS_44]], <64 x i32> [[DOTEXT_48]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add nuw i32 [[TMP1]], 208 +; CHECK-NEXT: [[DOTOFF_208:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_208]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_52:%.*]] = shufflevector <4 x i32> [[DOTOFF_208]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_52:%.*]] = shufflevector <64 x i32> [[DOTPARTS_48]], <64 x i32> [[DOTEXT_52]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add nuw i32 [[TMP1]], 224 +; CHECK-NEXT: [[DOTOFF_224:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_224]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_56:%.*]] = shufflevector <4 x i32> [[DOTOFF_224]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_56:%.*]] = shufflevector <64 x i32> [[DOTPARTS_52]], <64 x i32> [[DOTEXT_56]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add nuw i32 [[TMP1]], 240 +; CHECK-NEXT: [[DOTOFF_240:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_240]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_60:%.*]] = shufflevector <4 x i32> [[DOTOFF_240]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[DOTPARTS_56]], <64 x i32> [[DOTEXT_60]], <64 x i32> +; CHECK-NEXT: [[LOOP_INDEX_C1:%.*]] = trunc i64 [[LOOP_INDEX]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX_C1]] +; CHECK-NEXT: [[DOTSLICE_0:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x 
i32> [[DOTSLICE_0]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP3]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP3]], 16 +; CHECK-NEXT: [[DOTSLICE_4:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_4]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP3]], 32 +; CHECK-NEXT: [[DOTSLICE_8:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_8]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP3]], 48 +; CHECK-NEXT: [[DOTSLICE_12:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_12]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw i32 [[TMP3]], 64 +; CHECK-NEXT: [[DOTSLICE_16:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_16]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP3]], 80 +; CHECK-NEXT: [[DOTSLICE_20:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_20]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP3]], 96 +; CHECK-NEXT: [[DOTSLICE_24:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_24]], ptr addrspace(8) align 1 
[[DST_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP3]], 112 +; CHECK-NEXT: [[DOTSLICE_28:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_28]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP3]], 128 +; CHECK-NEXT: [[DOTSLICE_32:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_32]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP3]], 144 +; CHECK-NEXT: [[DOTSLICE_36:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_36]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP3]], 160 +; CHECK-NEXT: [[DOTSLICE_40:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_40]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP3]], 176 +; CHECK-NEXT: [[DOTSLICE_44:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_44]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP3]], 192 +; CHECK-NEXT: [[DOTSLICE_48:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_48]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 
[[DOTPART_48]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP3]], 208 +; CHECK-NEXT: [[DOTSLICE_52:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_52]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP3]], 224 +; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240 +; CHECK-NEXT: [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) +; CHECK-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 8192 +; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] +; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.memcpy.inline.p7.p7.i64(ptr addrspace(7) %dst, ptr addrspace(7) %src, i64 8192, i1 false) + ret void +} + +define void @memcpy.inline_known_i32_volatile(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) { +; CHECK-LABEL: define void @memcpy.inline_known_i32_volatile( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 +; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 +; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 +; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 
} [[SRC]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 -2147483648) +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP1]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DST_OFF]], i32 0, i32 -2147483648) +; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[SRC_OFF]], 16 +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP2]], i32 0, i32 -2147483648) +; CHECK-NEXT: [[TMP4:%.*]] = add nuw i32 [[DST_OFF]], 16 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP3]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP4]], i32 0, i32 -2147483648) +; CHECK-NEXT: ret void +; + call void @llvm.memcpy.inline.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 32, i1 true) + ret void +} + +define void @memcpy.inline_unknown(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst, i32 inreg %length) { +; CHECK-LABEL: define void @memcpy.inline_unknown( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]], i32 inreg [[LENGTH:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 +; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 +; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 +; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[LENGTH]], 15 +; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[LENGTH]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[TMP3]], label %[[LOOP_MEMCPY_EXPANSION:.*]], label %[[LOOP_MEMCPY_RESIDUAL_HEADER:.*]] +; CHECK: [[LOOP_MEMCPY_EXPANSION]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP7:%.*]], %[[LOOP_MEMCPY_EXPANSION]] ] +; 
CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP4]], i32 0, i32 0) +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP5]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP6]], i32 0, i32 0) +; CHECK-NEXT: [[TMP7]] = add i32 [[LOOP_INDEX]], 16 +; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], [[TMP2]] +; CHECK-NEXT: br i1 [[TMP8]], label %[[LOOP_MEMCPY_EXPANSION]], label %[[LOOP_MEMCPY_RESIDUAL_HEADER]] +; CHECK: [[LOOP_MEMCPY_RESIDUAL:.*]]: +; CHECK-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, %[[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP13:%.*]], %[[LOOP_MEMCPY_RESIDUAL]] ] +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP2]], [[RESIDUAL_LOOP_INDEX]] +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[SRC_OFF]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP10]], i32 0, i32 0) +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[DST_OFF]], [[TMP9]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[TMP11]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP12]], i32 0, i32 0) +; CHECK-NEXT: [[TMP13]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ult i32 [[TMP13]], [[TMP1]] +; CHECK-NEXT: br i1 [[TMP14]], label %[[LOOP_MEMCPY_RESIDUAL]], label %[[POST_LOOP_MEMCPY_EXPANSION:.*]] +; CHECK: [[POST_LOOP_MEMCPY_EXPANSION]]: +; CHECK-NEXT: ret void +; CHECK: [[LOOP_MEMCPY_RESIDUAL_HEADER]]: +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i32 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[TMP15]], label %[[LOOP_MEMCPY_RESIDUAL]], label %[[POST_LOOP_MEMCPY_EXPANSION]] +; + call void @llvm.memcpy.inline.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 %length, i1 false) + ret void +} + +define void @memcpy.inline_known_p1_to_p7(ptr addrspace(1) 
inreg %src, ptr addrspace(7) inreg %dst) { +; CHECK-LABEL: define void @memcpy.inline_known_p1_to_p7( +; CHECK-SAME: ptr addrspace(1) inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 +; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 +; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] +; CHECK: [[LOAD_STORE_LOOP]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i32 [[LOOP_INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 16 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX]] +; CHECK-NEXT: [[DOTSLICE_0:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_0]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[TMP3]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP3]], 16 +; CHECK-NEXT: [[DOTSLICE_4:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_4]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP3]], 32 +; CHECK-NEXT: [[DOTSLICE_8:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_8]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP3]], 48 +; CHECK-NEXT: [[DOTSLICE_12:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_12]], ptr addrspace(8) align 16 
[[DST_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw i32 [[TMP3]], 64 +; CHECK-NEXT: [[DOTSLICE_16:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_16]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP3]], 80 +; CHECK-NEXT: [[DOTSLICE_20:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_20]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP3]], 96 +; CHECK-NEXT: [[DOTSLICE_24:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_24]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP3]], 112 +; CHECK-NEXT: [[DOTSLICE_28:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_28]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP3]], 128 +; CHECK-NEXT: [[DOTSLICE_32:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_32]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP3]], 144 +; CHECK-NEXT: [[DOTSLICE_36:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_36]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 
[[DOTPART_36]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP3]], 160 +; CHECK-NEXT: [[DOTSLICE_40:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_40]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP3]], 176 +; CHECK-NEXT: [[DOTSLICE_44:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_44]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP3]], 192 +; CHECK-NEXT: [[DOTSLICE_48:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_48]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP3]], 208 +; CHECK-NEXT: [[DOTSLICE_52:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_52]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP3]], 224 +; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0) +; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240 +; CHECK-NEXT: [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, 
i32 0) +; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 +; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] +; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.memcpy.inline.p7.p1.i32(ptr addrspace(7) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i32 8192, i1 false) + ret void +} + +define void @memcpy.inline_known_p7_to_p1(ptr addrspace(7) inreg %src, ptr addrspace(1) inreg %dst) { +; CHECK-LABEL: define void @memcpy.inline_known_p7_to_p1( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], ptr addrspace(1) inreg [[DST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 +; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 +; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] +; CHECK: [[LOAD_STORE_LOOP]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] +; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_0:%.*]] = shufflevector <64 x i32> poison, <64 x i32> [[DOTEXT_0]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add nuw i32 [[TMP1]], 16 +; CHECK-NEXT: [[DOTOFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_16]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_4:%.*]] = shufflevector <4 x i32> [[DOTOFF_16]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_4:%.*]] = shufflevector <64 x i32> [[DOTPARTS_0]], <64 x i32> [[DOTEXT_4]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add nuw i32 [[TMP1]], 32 +; 
CHECK-NEXT: [[DOTOFF_32:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_32]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_8:%.*]] = shufflevector <4 x i32> [[DOTOFF_32]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_8:%.*]] = shufflevector <64 x i32> [[DOTPARTS_4]], <64 x i32> [[DOTEXT_8]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add nuw i32 [[TMP1]], 48 +; CHECK-NEXT: [[DOTOFF_48:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_48]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_12:%.*]] = shufflevector <4 x i32> [[DOTOFF_48]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_12:%.*]] = shufflevector <64 x i32> [[DOTPARTS_8]], <64 x i32> [[DOTEXT_12]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add nuw i32 [[TMP1]], 64 +; CHECK-NEXT: [[DOTOFF_64:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_64]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_16:%.*]] = shufflevector <4 x i32> [[DOTOFF_64]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_16:%.*]] = shufflevector <64 x i32> [[DOTPARTS_12]], <64 x i32> [[DOTEXT_16]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add nuw i32 [[TMP1]], 80 +; CHECK-NEXT: [[DOTOFF_80:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_80]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_20:%.*]] = shufflevector <4 x i32> [[DOTOFF_80]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_20:%.*]] = shufflevector <64 x i32> [[DOTPARTS_16]], <64 x i32> [[DOTEXT_20]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add nuw i32 [[TMP1]], 96 +; CHECK-NEXT: [[DOTOFF_96:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_96]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_24:%.*]] = 
shufflevector <4 x i32> [[DOTOFF_96]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_24:%.*]] = shufflevector <64 x i32> [[DOTPARTS_20]], <64 x i32> [[DOTEXT_24]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add nuw i32 [[TMP1]], 112 +; CHECK-NEXT: [[DOTOFF_112:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_112]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_28:%.*]] = shufflevector <4 x i32> [[DOTOFF_112]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_28:%.*]] = shufflevector <64 x i32> [[DOTPARTS_24]], <64 x i32> [[DOTEXT_28]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add nuw i32 [[TMP1]], 128 +; CHECK-NEXT: [[DOTOFF_128:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_128]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_32:%.*]] = shufflevector <4 x i32> [[DOTOFF_128]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_32:%.*]] = shufflevector <64 x i32> [[DOTPARTS_28]], <64 x i32> [[DOTEXT_32]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add nuw i32 [[TMP1]], 144 +; CHECK-NEXT: [[DOTOFF_144:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_144]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_36:%.*]] = shufflevector <4 x i32> [[DOTOFF_144]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_36:%.*]] = shufflevector <64 x i32> [[DOTPARTS_32]], <64 x i32> [[DOTEXT_36]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add nuw i32 [[TMP1]], 160 +; CHECK-NEXT: [[DOTOFF_160:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_160]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_40:%.*]] = shufflevector <4 x i32> [[DOTOFF_160]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_40:%.*]] = shufflevector <64 x i32> [[DOTPARTS_36]], <64 x i32> [[DOTEXT_40]], <64 x i32> 
+; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add nuw i32 [[TMP1]], 176 +; CHECK-NEXT: [[DOTOFF_176:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_176]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_44:%.*]] = shufflevector <4 x i32> [[DOTOFF_176]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_44:%.*]] = shufflevector <64 x i32> [[DOTPARTS_40]], <64 x i32> [[DOTEXT_44]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add nuw i32 [[TMP1]], 192 +; CHECK-NEXT: [[DOTOFF_192:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_192]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_48:%.*]] = shufflevector <4 x i32> [[DOTOFF_192]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_48:%.*]] = shufflevector <64 x i32> [[DOTPARTS_44]], <64 x i32> [[DOTEXT_48]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add nuw i32 [[TMP1]], 208 +; CHECK-NEXT: [[DOTOFF_208:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_208]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_52:%.*]] = shufflevector <4 x i32> [[DOTOFF_208]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_52:%.*]] = shufflevector <64 x i32> [[DOTPARTS_48]], <64 x i32> [[DOTEXT_52]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add nuw i32 [[TMP1]], 224 +; CHECK-NEXT: [[DOTOFF_224:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_224]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_56:%.*]] = shufflevector <4 x i32> [[DOTOFF_224]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_56:%.*]] = shufflevector <64 x i32> [[DOTPARTS_52]], <64 x i32> [[DOTEXT_56]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add nuw i32 [[TMP1]], 240 +; CHECK-NEXT: [[DOTOFF_240:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 
[[SRC_RSRC]], i32 [[DOTOFF_PTR_240]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_60:%.*]] = shufflevector <4 x i32> [[DOTOFF_240]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[DOTPARTS_56]], <64 x i32> [[DOTEXT_60]], <64 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i32 [[LOOP_INDEX]] +; CHECK-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 16 +; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 +; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] +; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.memcpy.inline.p1.p7.i32(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false) + ret void +} + +;; This could be the direct-to-LDS intrinsics in a future patch +define void @memcpy.inline_known_p7_to_p3(ptr addrspace(7) inreg %src, ptr addrspace(3) inreg %dst) { +; CHECK-LABEL: define void @memcpy.inline_known_p7_to_p3( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], ptr addrspace(3) inreg [[DST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 +; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 0) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 0 +; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr addrspace(3) [[TMP2]], align 16 +; CHECK-NEXT: ret void +; + call void @llvm.memcpy.inline.p3.p7.i32(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 16, i1 false) + ret void +} + +define void @memcpy.inline_known_p7_to_p3_byte(ptr addrspace(7) inreg %src, ptr addrspace(3) inreg 
%dst) { +; CHECK-LABEL: define void @memcpy.inline_known_p7_to_p3_byte( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], ptr addrspace(3) inreg [[DST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 +; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 0) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 0 +; CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(3) [[TMP2]], align 16 +; CHECK-NEXT: ret void +; + call void @llvm.memcpy.inline.p3.p7.i32(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 4, i1 false) + ret void +} + +define void @memcpy.inline_known_p7_to_p3_long(ptr addrspace(7) inreg %src, ptr addrspace(3) inreg %dst) { +; CHECK-LABEL: define void @memcpy.inline_known_p7_to_p3_long( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], ptr addrspace(3) inreg [[DST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 +; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 +; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] +; CHECK: [[LOAD_STORE_LOOP]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] +; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_0:%.*]] = shufflevector <64 x i32> poison, <64 x i32> [[DOTEXT_0]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add nuw i32 [[TMP1]], 16 +; CHECK-NEXT: 
[[DOTOFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_16]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_4:%.*]] = shufflevector <4 x i32> [[DOTOFF_16]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_4:%.*]] = shufflevector <64 x i32> [[DOTPARTS_0]], <64 x i32> [[DOTEXT_4]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add nuw i32 [[TMP1]], 32 +; CHECK-NEXT: [[DOTOFF_32:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_32]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_8:%.*]] = shufflevector <4 x i32> [[DOTOFF_32]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_8:%.*]] = shufflevector <64 x i32> [[DOTPARTS_4]], <64 x i32> [[DOTEXT_8]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add nuw i32 [[TMP1]], 48 +; CHECK-NEXT: [[DOTOFF_48:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_48]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_12:%.*]] = shufflevector <4 x i32> [[DOTOFF_48]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_12:%.*]] = shufflevector <64 x i32> [[DOTPARTS_8]], <64 x i32> [[DOTEXT_12]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add nuw i32 [[TMP1]], 64 +; CHECK-NEXT: [[DOTOFF_64:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_64]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_16:%.*]] = shufflevector <4 x i32> [[DOTOFF_64]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_16:%.*]] = shufflevector <64 x i32> [[DOTPARTS_12]], <64 x i32> [[DOTEXT_16]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add nuw i32 [[TMP1]], 80 +; CHECK-NEXT: [[DOTOFF_80:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_80]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_20:%.*]] = shufflevector <4 x i32> 
[[DOTOFF_80]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_20:%.*]] = shufflevector <64 x i32> [[DOTPARTS_16]], <64 x i32> [[DOTEXT_20]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add nuw i32 [[TMP1]], 96 +; CHECK-NEXT: [[DOTOFF_96:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_96]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_24:%.*]] = shufflevector <4 x i32> [[DOTOFF_96]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_24:%.*]] = shufflevector <64 x i32> [[DOTPARTS_20]], <64 x i32> [[DOTEXT_24]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add nuw i32 [[TMP1]], 112 +; CHECK-NEXT: [[DOTOFF_112:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_112]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_28:%.*]] = shufflevector <4 x i32> [[DOTOFF_112]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_28:%.*]] = shufflevector <64 x i32> [[DOTPARTS_24]], <64 x i32> [[DOTEXT_28]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add nuw i32 [[TMP1]], 128 +; CHECK-NEXT: [[DOTOFF_128:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_128]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_32:%.*]] = shufflevector <4 x i32> [[DOTOFF_128]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_32:%.*]] = shufflevector <64 x i32> [[DOTPARTS_28]], <64 x i32> [[DOTEXT_32]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add nuw i32 [[TMP1]], 144 +; CHECK-NEXT: [[DOTOFF_144:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_144]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_36:%.*]] = shufflevector <4 x i32> [[DOTOFF_144]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_36:%.*]] = shufflevector <64 x i32> [[DOTPARTS_32]], <64 x i32> [[DOTEXT_36]], <64 x i32> +; CHECK-NEXT: 
[[DOTOFF_PTR_160:%.*]] = add nuw i32 [[TMP1]], 160 +; CHECK-NEXT: [[DOTOFF_160:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_160]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_40:%.*]] = shufflevector <4 x i32> [[DOTOFF_160]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_40:%.*]] = shufflevector <64 x i32> [[DOTPARTS_36]], <64 x i32> [[DOTEXT_40]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add nuw i32 [[TMP1]], 176 +; CHECK-NEXT: [[DOTOFF_176:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_176]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_44:%.*]] = shufflevector <4 x i32> [[DOTOFF_176]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_44:%.*]] = shufflevector <64 x i32> [[DOTPARTS_40]], <64 x i32> [[DOTEXT_44]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add nuw i32 [[TMP1]], 192 +; CHECK-NEXT: [[DOTOFF_192:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_192]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_48:%.*]] = shufflevector <4 x i32> [[DOTOFF_192]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_48:%.*]] = shufflevector <64 x i32> [[DOTPARTS_44]], <64 x i32> [[DOTEXT_48]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add nuw i32 [[TMP1]], 208 +; CHECK-NEXT: [[DOTOFF_208:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_208]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_52:%.*]] = shufflevector <4 x i32> [[DOTOFF_208]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_52:%.*]] = shufflevector <64 x i32> [[DOTPARTS_48]], <64 x i32> [[DOTEXT_52]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add nuw i32 [[TMP1]], 224 +; CHECK-NEXT: [[DOTOFF_224:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], 
i32 [[DOTOFF_PTR_224]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_56:%.*]] = shufflevector <4 x i32> [[DOTOFF_224]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[DOTPARTS_56:%.*]] = shufflevector <64 x i32> [[DOTPARTS_52]], <64 x i32> [[DOTEXT_56]], <64 x i32> +; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add nuw i32 [[TMP1]], 240 +; CHECK-NEXT: [[DOTOFF_240:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_240]], i32 0, i32 0) +; CHECK-NEXT: [[DOTEXT_60:%.*]] = shufflevector <4 x i32> [[DOTOFF_240]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[DOTPARTS_56]], <64 x i32> [[DOTEXT_60]], <64 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[LOOP_INDEX]] +; CHECK-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 16 +; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 +; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] +; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.memcpy.inline.p3.p7.i32(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false) + ret void +} + +;; memset + +declare void @llvm.memset.p7.i32(ptr addrspace(7), i8, i32, i1) +declare void @llvm.memset.p7.i64(ptr addrspace(7), i8, i64, i1) + +define void @memset_known(ptr addrspace(7) inreg %ptr) { +; CHECK-LABEL: define void @memset_known( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 +; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 +; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] +; CHECK: [[LOADSTORELOOP]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] 
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) +; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 8192 +; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.memset.p7.i32(ptr addrspace(7) noundef nonnull align 16 %ptr, i8 1, i32 8192, i1 false) + ret void +} + +define void @memset_known_small(ptr addrspace(7) inreg %ptr) { +; CHECK-LABEL: define void @memset_known_small( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 +; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 +; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] +; CHECK: [[LOADSTORELOOP]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) +; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 32 +; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.memset.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 32, i1 false) + ret void +} + +define void @memset_known_byte(ptr addrspace(7) inreg %ptr) { +; CHECK-LABEL: define void @memset_known_byte( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 +; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 +; CHECK-NEXT: br i1 false, label 
%[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] +; CHECK: [[LOADSTORELOOP]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) +; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 1 +; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.memset.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 1, i1 false) + ret void +} + +define void @memset_known_tail(ptr addrspace(7) inreg %ptr) { +; CHECK-LABEL: define void @memset_known_tail( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 +; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 +; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] +; CHECK: [[LOADSTORELOOP]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) +; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 15 +; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.memset.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 15, i1 false) + ret void +} + +;; An i64 length gets an i64 induction variable that is truncated to i32 for the 32-bit buffer offset. +define void @memset_known_i64(ptr addrspace(7) inreg %ptr) { +; CHECK-LABEL: define void @memset_known_i64( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 
} [[PTR]], 0 +; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 +; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] +; CHECK: [[LOADSTORELOOP]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[DOTC:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[DOTC]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) +; CHECK-NEXT: [[TMP3]] = add i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8192 +; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.memset.p7.i64(ptr addrspace(7) %ptr, i8 1, i64 8192, i1 false) + ret void +} + +;; Volatile memsets still expand to a loop; the stores carry aux value -2147483648 (bit 31) - presumably the volatile cachepolicy flag, confirm against the buffer intrinsic docs. +define void @memset_known_i32_volatile(ptr addrspace(7) inreg %ptr) { +; CHECK-LABEL: define void @memset_known_i32_volatile( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 +; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 +; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] +; CHECK: [[LOADSTORELOOP]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 -2147483648) +; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 32 +; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.memset.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 32, i1 true) + ret void +} + +define void @memset_unknown(ptr 
addrspace(7) inreg %ptr, i32 inreg %length) { +; CHECK-LABEL: define void @memset_unknown( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]], i32 inreg [[LENGTH:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 +; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 0, [[LENGTH]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] +; CHECK: [[LOADSTORELOOP]]: +; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[PTR_OFF]], [[TMP2]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP3]], i32 0, i32 0) +; CHECK-NEXT: [[TMP4]] = add i32 [[TMP2]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], [[LENGTH]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.memset.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 %length, i1 false) + ret void +} + +;; memset.inline +;; The memset.inline variants are expanded to the same loops as plain memset. + +declare void @llvm.memset.inline.p7.i32(ptr addrspace(7), i8, i32, i1) +declare void @llvm.memset.inline.p7.i64(ptr addrspace(7), i8, i64, i1) + +define void @memset.inline_known(ptr addrspace(7) inreg %ptr) { +; CHECK-LABEL: define void @memset.inline_known( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 +; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 +; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] +; CHECK: [[LOADSTORELOOP]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr 
addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) +; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 8192 +; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.memset.inline.p7.i32(ptr addrspace(7) noundef nonnull align 16 %ptr, i8 1, i32 8192, i1 false) + ret void +} + +define void @memset.inline_known_small(ptr addrspace(7) inreg %ptr) { +; CHECK-LABEL: define void @memset.inline_known_small( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 +; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 +; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] +; CHECK: [[LOADSTORELOOP]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) +; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 32 +; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.memset.inline.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 32, i1 false) + ret void +} + +define void @memset.inline_known_byte(ptr addrspace(7) inreg %ptr) { +; CHECK-LABEL: define void @memset.inline_known_byte( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 +; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 +; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] +; CHECK: [[LOADSTORELOOP]]: +; CHECK-NEXT: 
[[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) +; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 1 +; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.memset.inline.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 1, i1 false) + ret void +} + +define void @memset.inline_known_tail(ptr addrspace(7) inreg %ptr) { +; CHECK-LABEL: define void @memset.inline_known_tail( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 +; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 +; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] +; CHECK: [[LOADSTORELOOP]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) +; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 15 +; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.memset.inline.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 15, i1 false) + ret void +} + +define void @memset.inline_known_i64(ptr addrspace(7) inreg %ptr) { +; CHECK-LABEL: define void @memset.inline_known_i64( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 +; CHECK-NEXT: [[PTR_OFF:%.*]] 
= extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 +; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] +; CHECK: [[LOADSTORELOOP]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[DOTC:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[DOTC]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) +; CHECK-NEXT: [[TMP3]] = add i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8192 +; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.memset.inline.p7.i64(ptr addrspace(7) %ptr, i8 1, i64 8192, i1 false) + ret void +} + +define void @memset.inline_known_i32_volatile(ptr addrspace(7) inreg %ptr) { +; CHECK-LABEL: define void @memset.inline_known_i32_volatile( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 +; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 +; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] +; CHECK: [[LOADSTORELOOP]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 -2147483648) +; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 32 +; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.memset.inline.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 32, i1 true) + ret void +} + +define void @memset.inline_unknown(ptr 
addrspace(7) inreg %ptr, i32 inreg %length) { +; CHECK-LABEL: define void @memset.inline_unknown( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]], i32 inreg [[LENGTH:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 +; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 0, [[LENGTH]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] +; CHECK: [[LOADSTORELOOP]]: +; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[PTR_OFF]], [[TMP2]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP3]], i32 0, i32 0) +; CHECK-NEXT: [[TMP4]] = add i32 [[TMP2]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], [[LENGTH]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.memset.inline.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 %length, i1 false) + ret void +} + +;; memset.pattern +;; The whole i32 pattern is stored per iteration, with the index scaled by 4 bytes. + +declare void @llvm.experimental.memset.pattern.p7.i32.i32(ptr addrspace(7), i32, i32, i1) +declare void @llvm.experimental.memset.pattern.p7.i32.i64(ptr addrspace(7), i32, i64, i1) + +define void @memset_pattern_known(ptr addrspace(7) inreg %ptr) { +; CHECK-LABEL: define void @memset_pattern_known( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 +; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 +; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] +; CHECK: [[LOADSTORELOOP]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[DOTIDX:%.*]] = mul nsw i32 [[TMP1]], 4 +; CHECK-NEXT: 
[[TMP2:%.*]] = add i32 [[PTR_OFF]], [[DOTIDX]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 1, ptr addrspace(8) align 4 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) +; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 8192 +; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.experimental.memset.pattern.p7.i32.i32(ptr addrspace(7) noundef nonnull align 16 %ptr, i32 1, i32 8192, i1 false) + ret void +} + +define void @memset_pattern_known_small(ptr addrspace(7) inreg %ptr) { +; CHECK-LABEL: define void @memset_pattern_known_small( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 +; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 +; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] +; CHECK: [[LOADSTORELOOP]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[DOTIDX:%.*]] = mul nsw i32 [[TMP1]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[DOTIDX]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) +; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 32 +; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.experimental.memset.pattern.p7.i32.i32(ptr addrspace(7) %ptr, i32 1, i32 32, i1 false) + ret void +} + +define void @memset_pattern_known_i64(ptr addrspace(7) inreg %ptr) { +; CHECK-LABEL: define void @memset_pattern_known_i64( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } 
[[PTR]], 0 +; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 +; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] +; CHECK: [[LOADSTORELOOP]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[DOTC:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[DOTIDX:%.*]] = mul nsw i32 [[DOTC]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[DOTIDX]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) +; CHECK-NEXT: [[TMP3]] = add i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8192 +; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.experimental.memset.pattern.p7.i32.i64(ptr addrspace(7) %ptr, i32 1, i64 8192, i1 false) + ret void +} + +define void @memset_pattern_known_i32_volatile(ptr addrspace(7) inreg %ptr) { +; CHECK-LABEL: define void @memset_pattern_known_i32_volatile( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 +; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 +; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] +; CHECK: [[LOADSTORELOOP]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[DOTIDX:%.*]] = mul nsw i32 [[TMP1]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[DOTIDX]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 -2147483648) +; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 32 +; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] +; CHECK: 
[[SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.experimental.memset.pattern.p7.i32.i32(ptr addrspace(7) %ptr, i32 1, i32 32, i1 true) + ret void +} + +define void @memset_pattern_unknown(ptr addrspace(7) inreg %ptr, i32 inreg %length) { +; CHECK-LABEL: define void @memset_pattern_unknown( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]], i32 inreg [[LENGTH:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 +; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 0, [[LENGTH]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] +; CHECK: [[LOADSTORELOOP]]: +; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[DOTIDX:%.*]] = mul nsw i32 [[TMP2]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[PTR_OFF]], [[DOTIDX]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP3]], i32 0, i32 0) +; CHECK-NEXT: [[TMP4]] = add i32 [[TMP2]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], [[LENGTH]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + call void @llvm.experimental.memset.pattern.p7.i32.i32(ptr addrspace(7) %ptr, i32 1, i32 %length, i1 false) + ret void +}