From 9487b09dee977075226300c0193570ad7ed3d8e8 Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Tue, 9 Jul 2024 18:17:44 +0200 Subject: [PATCH 1/7] [NVPTX] Volta Relaxed/Acquire/Release and Volatile Load/Store Ops --- .../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 28 +- llvm/lib/Target/NVPTX/NVPTX.h | 8 + llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 301 ++++-- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 144 +-- llvm/test/CodeGen/NVPTX/load-store-sm-70.ll | 951 ++++++++++++++++++ llvm/test/CodeGen/NVPTX/load-store.ll | 553 +++++++++- 6 files changed, 1829 insertions(+), 156 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/load-store-sm-70.ll diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index 380d878c1f532..a004d64c21cc6 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -227,9 +227,33 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, if (Modifier) { const MCOperand &MO = MI->getOperand(OpNum); int Imm = (int) MO.getImm(); - if (!strcmp(Modifier, "volatile")) { - if (Imm) + if (!strcmp(Modifier, "sem")) { + switch (Imm) { + case NVPTX::PTXLdStInstCode::NotAtomic: + break; + case NVPTX::PTXLdStInstCode::Volatile: O << ".volatile"; + break; + case NVPTX::PTXLdStInstCode::Relaxed: + O << ".relaxed.sys"; + break; + case NVPTX::PTXLdStInstCode::Acquire: + O << ".acquire.sys"; + break; + case NVPTX::PTXLdStInstCode::Release: + O << ".release.sys"; + break; + case NVPTX::PTXLdStInstCode::RelaxedMMIO: + O << ".mmio.relaxed.sys"; + break; + default: + SmallString<256> Msg; + raw_svector_ostream OS(Msg); + OS << "NVPTX LdStCode Printer does not support \"" << Imm + << "\" sem modifier."; + report_fatal_error(OS.str()); + break; + } } else if (!strcmp(Modifier, "addsp")) { switch (Imm) { case NVPTX::PTXLdStInstCode::GLOBAL: diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index b0cb24c63c3ce..3c7167b157025 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -107,6 +107,14 @@ enum LoadStore { }; namespace PTXLdStInstCode { +enum MemorySemantic { + NotAtomic = 0, // PTX calls these: "Weak" + Volatile = 1, + Relaxed = 2, + Acquire = 3, + Release = 4, + RelaxedMMIO = 5 +}; enum AddressSpace { GENERIC = 0, GLOBAL = 1, diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 11193c11ede3b..41c2da89f116d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -714,6 +714,157 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) { return NVPTX::PTXLdStInstCode::GENERIC; } +static unsigned int getCodeMemorySemantic(MemSDNode *N, + const NVPTXSubtarget *Subtarget) { + AtomicOrdering Ordering = N->getSuccessOrdering(); + auto CodeAddrSpace = getCodeAddrSpace(N); + + // Supports relaxed, acquire, release, weak: + bool hasAtomics = + Subtarget->getPTXVersion() >= 60 && Subtarget->getSmVersion() >= 70; + // Supports mmio: + bool hasRelaxedMMIO = + Subtarget->getPTXVersion() >= 82 && Subtarget->getSmVersion() >= 70; + + // TODO: lowering for SequentiallyConsistent Operations: for now, we error. + // TODO: lowering for AcquireRelease Operations: for now, we error. 
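+  // Illustrative example of the mapping summarized in the table below: a
+  // non-volatile "monotonic" load through a generic pointer is lowered to
+  // Relaxed (printed as "ld.relaxed.sys") on sm_70+ with PTX 6.0+, and falls
+  // back to Volatile (printed as "ld.volatile") on older targets.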
+  //
+
+  // clang-format off
+
+  // Lowering for non-SequentiallyConsistent Operations
+  //
+  // | Atomic  | Volatile | Statespace                    | Lowering sm_60- | Lowering sm_70+                                      |
+  // |---------|----------|-------------------------------|-----------------|------------------------------------------------------|
+  // | No      | No       | All                           | plain           | .weak                                                |
+  // | No      | Yes      | Generic / Shared / Global [0] | .volatile       | .volatile                                            |
+  // | No      | Yes      | Local / Const / Param         | plain [1]       | .weak [1]                                            |
+  // | Relaxed | No       | Generic / Shared / Global [0] | .volatile       | .relaxed.sys                                         |
+  // | Other   | No       | Generic / Shared / Global [0] | Error [2]       | .acquire.sys or .release.sys                         |
+  // | Yes     | No       | Local / Const / Param         | plain [1]       | .weak [1]                                            |
+  // | Relaxed | Yes      | Generic / Shared [0]          | .volatile       | .volatile                                            |
+  // | Relaxed | Yes      | Global [0]                    | .volatile       | .mmio.relaxed.sys (PTX 8.2+) or .volatile (PTX 8.1-) |
+  // | Relaxed | Yes      | Local / Const / Param         | plain [1]       | .weak [1]                                            |
+  // | Other   | Yes      | Generic / Shared / Global [0] | Error [4]       | .acquire.sys or .release.sys [3]                     |
+
+  // clang-format on
+
+  // [0]: volatile and atomics are only supported via generic addressing to
+  //      shared or global memory, or via shared or global addressing.
+  //      MMIO requires generic addressing to global memory or global
+  //      addressing, but (TODO) we only implement it for global addressing.
+
+  // [1]: TODO: this implementation exhibits PTX Undefined Behavior; it
+  //      fails to preserve the side-effects of atomics and volatile
+  //      accesses in LLVM IR to local / const / param, causing
+  //      well-formed LLVM-IR & CUDA C++ programs to be miscompiled
+  //      on sm_70+.
+
+  if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL ||
+      CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT ||
+      CodeAddrSpace == NVPTX::PTXLdStInstCode::PARAM) {
+    return NVPTX::PTXLdStInstCode::NotAtomic;
+  }
+
+  // [2]: Atomics with an ordering other than Relaxed are not supported on
+  //      sm_60 and older.
+  if (!(Ordering == AtomicOrdering::NotAtomic ||
+        Ordering == AtomicOrdering::Monotonic) &&
+      !hasAtomics) {
+    SmallString<256> Msg;
+    raw_svector_ostream OS(Msg);
+    OS << "PTX does not support \"atomic\" for orderings different than "
+          "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order is: \""
+       << toIRString(Ordering) << "\".";
+    report_fatal_error(OS.str());
+  }
+
+  // [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop
+  //      the volatile semantics and preserve the atomic ones.
+  // [4]: TODO: volatile atomics with an order stronger than relaxed are
+  //      currently unimplemented on sm_60 and older.
+  if (!hasAtomics && N->isVolatile() &&
+      !(Ordering == AtomicOrdering::NotAtomic ||
+        Ordering == AtomicOrdering::Monotonic)) {
+    SmallString<256> Msg;
+    raw_svector_ostream OS(Msg);
+    OS << "PTX does not support \"volatile atomic\" for orderings different "
+          "than \"NotAtomic\" or \"Monotonic\" for sm_60 and older, but order "
+          "is: \""
+       << toIRString(Ordering) << "\".";
+    report_fatal_error(OS.str());
+  }
+
+  // PTX volatile and PTX atomics are not available for statespaces other than
+  // .generic, .global, and .shared. The behavior of PTX volatile and PTX
+  // atomics is undefined if the generic address does not refer to a .global or
+  // .shared memory location.
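+  // For example, on sm_70+ with PTX 8.2+, a "load atomic volatile ... monotonic"
+  // through a global pointer selects RelaxedMMIO below and is printed as
+  // "ld.mmio.relaxed.sys.global"; the same access through a generic or shared
+  // pointer keeps ".volatile" (see the load-store-sm-70.ll tests added below).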
+ bool addrGenericOrGlobalOrShared = + (CodeAddrSpace == NVPTX::PTXLdStInstCode::GENERIC || + CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL || + CodeAddrSpace == NVPTX::PTXLdStInstCode::SHARED); + bool useRelaxedMMIO = + hasRelaxedMMIO && CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL; + + switch (Ordering) { + case AtomicOrdering::NotAtomic: + return N->isVolatile() && addrGenericOrGlobalOrShared + ? NVPTX::PTXLdStInstCode::Volatile + : NVPTX::PTXLdStInstCode::NotAtomic; + case AtomicOrdering::Monotonic: + if (N->isVolatile()) + return useRelaxedMMIO ? NVPTX::PTXLdStInstCode::RelaxedMMIO + : addrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile + : NVPTX::PTXLdStInstCode::NotAtomic; + else + return hasAtomics ? NVPTX::PTXLdStInstCode::Relaxed + : addrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile + : NVPTX::PTXLdStInstCode::NotAtomic; + case AtomicOrdering::Acquire: + if (!N->readMem()) { + SmallString<256> Msg; + raw_svector_ostream OS(Msg); + OS << "PTX only supports Acquire Ordering on reads: " + << N->getOperationName(); + N->print(OS); + report_fatal_error(OS.str()); + } + return addrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Acquire + : NVPTX::PTXLdStInstCode::NotAtomic; + case AtomicOrdering::Release: + if (!N->writeMem()) { + SmallString<256> Msg; + raw_svector_ostream OS(Msg); + OS << "PTX only supports Release Ordering on writes: " + << N->getOperationName(); + N->print(OS); + report_fatal_error(OS.str()); + } + return addrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Release + : NVPTX::PTXLdStInstCode::NotAtomic; + case AtomicOrdering::AcquireRelease: { + SmallString<256> Msg; + raw_svector_ostream OS(Msg); + OS << "PTX only supports AcquireRelease Ordering on read-modify-write: " + << N->getOperationName(); + N->print(OS); + report_fatal_error(OS.str()); + } + case AtomicOrdering::SequentiallyConsistent: + case AtomicOrdering::Unordered: + default: { + // TODO: support AcquireRelease and SequentiallyConsistent + SmallString<256> Msg; + raw_svector_ostream OS(Msg); + OS << "NVPTX backend does not support AtomicOrdering \"" + << toIRString(Ordering) << "\" yet."; + report_fatal_error(OS.str()); + } + } + + report_fatal_error("unreachable"); +} + static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, unsigned CodeAddrSpace, MachineFunction *F) { // We use ldg (i.e. ld.global.nc) for invariant loads from the global address @@ -916,32 +1067,18 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { if (!LoadedVT.isSimple()) return false; - AtomicOrdering Ordering = LD->getSuccessOrdering(); - // In order to lower atomic loads with stronger guarantees we would need to - // use load.acquire or insert fences. However these features were only added - // with PTX ISA 6.0 / sm_70. - // TODO: Check if we can actually use the new instructions and implement them. 
- if (isStrongerThanMonotonic(Ordering)) - return false; - // Address Space Setting unsigned int CodeAddrSpace = getCodeAddrSpace(LD); if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) { return tryLDGLDU(N); } + // Memory Semantic Setting + unsigned int CodeMemorySem = getCodeMemorySemantic(LD, Subtarget); + unsigned int PointerSize = CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace()); - // Volatile Setting - // - .volatile is only available for .global and .shared - // - .volatile has the same memory synchronization semantics as .relaxed.sys - bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic; - if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && - CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && - CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) - isVolatile = false; - // Type Setting: fromType + fromTypeWidth // // Sign : ISD::SEXTLOAD @@ -982,9 +1119,13 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_avar, NVPTX::LD_f64_avar); if (!Opcode) return false; - SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), getI32Imm(fromType, dl), - getI32Imm(fromTypeWidth, dl), Addr, Chain }; + SDValue Ops[] = {getI32Imm(CodeMemorySem, dl), + getI32Imm(CodeAddrSpace, dl), + getI32Imm(vecType, dl), + getI32Imm(fromType, dl), + getI32Imm(fromTypeWidth, dl), + Addr, + Chain}; NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset) : SelectADDRsi(N1.getNode(), N1, Base, Offset)) { @@ -993,9 +1134,14 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_asi, NVPTX::LD_f64_asi); if (!Opcode) return false; - SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), getI32Imm(fromType, dl), - getI32Imm(fromTypeWidth, dl), Base, Offset, Chain }; + SDValue Ops[] = {getI32Imm(CodeMemorySem, dl), + getI32Imm(CodeAddrSpace, dl), + getI32Imm(vecType, dl), + getI32Imm(fromType, dl), + getI32Imm(fromTypeWidth, dl), + Base, + Offset, + Chain}; NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); } else if (PointerSize == 64 ? 
SelectADDRri64(N1.getNode(), N1, Base, Offset) : SelectADDRri(N1.getNode(), N1, Base, Offset)) { @@ -1010,9 +1156,14 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_ari, NVPTX::LD_f64_ari); if (!Opcode) return false; - SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), getI32Imm(fromType, dl), - getI32Imm(fromTypeWidth, dl), Base, Offset, Chain }; + SDValue Ops[] = {getI32Imm(CodeMemorySem, dl), + getI32Imm(CodeAddrSpace, dl), + getI32Imm(vecType, dl), + getI32Imm(fromType, dl), + getI32Imm(fromTypeWidth, dl), + Base, + Offset, + Chain}; NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); } else { if (PointerSize == 64) @@ -1026,9 +1177,13 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_areg, NVPTX::LD_f64_areg); if (!Opcode) return false; - SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), getI32Imm(fromType, dl), - getI32Imm(fromTypeWidth, dl), N1, Chain }; + SDValue Ops[] = {getI32Imm(CodeMemorySem, dl), + getI32Imm(CodeAddrSpace, dl), + getI32Imm(vecType, dl), + getI32Imm(fromType, dl), + getI32Imm(fromTypeWidth, dl), + N1, + Chain}; NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); } @@ -1065,13 +1220,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { unsigned int PointerSize = CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace()); - // Volatile Setting - // - .volatile is only availalble for .global and .shared - bool IsVolatile = MemSD->isVolatile(); - if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && - CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && - CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) - IsVolatile = false; + // Memory Semantic Setting + unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget); // Vector Setting MVT SimpleVT = LoadedVT.getSimpleVT(); @@ -1138,9 +1288,13 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL), - getI32Imm(VecType, DL), getI32Imm(FromType, DL), - getI32Imm(FromTypeWidth, DL), Addr, Chain }; + SDValue Ops[] = {getI32Imm(CodeMemorySem, DL), + getI32Imm(CodeAddrSpace, DL), + getI32Imm(VecType, DL), + getI32Imm(FromType, DL), + getI32Imm(FromTypeWidth, DL), + Addr, + Chain}; LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); } else if (PointerSize == 64 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset) @@ -1163,9 +1317,14 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL), - getI32Imm(VecType, DL), getI32Imm(FromType, DL), - getI32Imm(FromTypeWidth, DL), Base, Offset, Chain }; + SDValue Ops[] = {getI32Imm(CodeMemorySem, DL), + getI32Imm(CodeAddrSpace, DL), + getI32Imm(VecType, DL), + getI32Imm(FromType, DL), + getI32Imm(FromTypeWidth, DL), + Base, + Offset, + Chain}; LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); } else if (PointerSize == 64 ? 
SelectADDRri64(Op1.getNode(), Op1, Base, Offset) @@ -1208,9 +1367,14 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL), - getI32Imm(VecType, DL), getI32Imm(FromType, DL), - getI32Imm(FromTypeWidth, DL), Base, Offset, Chain }; + SDValue Ops[] = {getI32Imm(CodeMemorySem, DL), + getI32Imm(CodeAddrSpace, DL), + getI32Imm(VecType, DL), + getI32Imm(FromType, DL), + getI32Imm(FromTypeWidth, DL), + Base, + Offset, + Chain}; LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); } else { @@ -1253,9 +1417,13 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL), - getI32Imm(VecType, DL), getI32Imm(FromType, DL), - getI32Imm(FromTypeWidth, DL), Op1, Chain }; + SDValue Ops[] = {getI32Imm(CodeMemorySem, DL), + getI32Imm(CodeAddrSpace, DL), + getI32Imm(VecType, DL), + getI32Imm(FromType, DL), + getI32Imm(FromTypeWidth, DL), + Op1, + Chain}; LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); } @@ -1698,27 +1866,13 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { if (!StoreVT.isSimple()) return false; - AtomicOrdering Ordering = ST->getSuccessOrdering(); - // In order to lower atomic loads with stronger guarantees we would need to - // use store.release or insert fences. However these features were only added - // with PTX ISA 6.0 / sm_70. - // TODO: Check if we can actually use the new instructions and implement them. - if (isStrongerThanMonotonic(Ordering)) - return false; - // Address Space Setting unsigned int CodeAddrSpace = getCodeAddrSpace(ST); unsigned int PointerSize = CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace()); - // Volatile Setting - // - .volatile is only available for .global and .shared - // - .volatile has the same memory synchronization semantics as .relaxed.sys - bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic; - if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && - CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && - CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) - isVolatile = false; + // Memory Semantic Setting + unsigned int CodeMemorySem = getCodeMemorySemantic(ST, Subtarget); // Vector Setting MVT SimpleVT = StoreVT.getSimpleVT(); @@ -1755,7 +1909,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { if (!Opcode) return false; SDValue Ops[] = {Value, - getI32Imm(isVolatile, dl), + getI32Imm(CodeMemorySem, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(toType, dl), @@ -1772,7 +1926,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { if (!Opcode) return false; SDValue Ops[] = {Value, - getI32Imm(isVolatile, dl), + getI32Imm(CodeMemorySem, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(toType, dl), @@ -1797,7 +1951,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { return false; SDValue Ops[] = {Value, - getI32Imm(isVolatile, dl), + getI32Imm(CodeMemorySem, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(toType, dl), @@ -1819,7 +1973,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { if (!Opcode) return false; SDValue Ops[] = {Value, - getI32Imm(isVolatile, dl), + getI32Imm(CodeMemorySem, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(toType, dl), @@ -1858,13 +2012,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { unsigned int PointerSize = 
CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace()); - // Volatile Setting - // - .volatile is only availalble for .global and .shared - bool IsVolatile = MemSD->isVolatile(); - if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && - CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && - CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) - IsVolatile = false; + // Memory Semantic Setting + unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget); // Type Setting: toType + toTypeWidth // - for integer type, always use 'u' @@ -1906,7 +2055,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { ToTypeWidth = 32; } - StOps.push_back(getI32Imm(IsVolatile, DL)); + StOps.push_back(getI32Imm(CodeMemorySem, DL)); StOps.push_back(getI32Imm(CodeAddrSpace, DL)); StOps.push_back(getI32Imm(VecType, DL)); StOps.push_back(getI32Imm(ToType, DL)); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 827febe845a4c..f37822f764bed 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -2941,39 +2941,39 @@ foreach vt = [v2f16, v2bf16, v2i16, v4i8] in { multiclass LD { def _avar : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr];", []>; def _areg : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr];", []>; def _areg_64 : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr];", []>; def _ari : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr+$offset];", []>; def _ari_64 : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr+$offset];", []>; def _asi : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr+$offset];", []>; } @@ -2989,39 
+2989,39 @@ let mayLoad=1, hasSideEffects=0 in { multiclass ST { def _avar : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr], $src;", []>; def _areg : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, + (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr], $src;", []>; def _areg_64 : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr], $src;", []>; def _ari : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr+$offset], $src;", []>; def _ari_64 : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr+$offset], $src;", []>; def _asi : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr+$offset], $src;", []>; } @@ -3040,75 +3040,75 @@ let mayStore=1, hasSideEffects=0 in { multiclass LD_VEC { def _v2_avar : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr];", []>; def _v2_areg : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr];", []>; def _v2_areg_64 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), - 
"ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr];", []>; def _v2_ari : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; def _v2_ari_64 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; def _v2_asi : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; def _v4_avar : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; def _v4_areg : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; def _v4_areg_64 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; def _v4_ari : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; def _v4_ari_64 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins 
LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; def _v4_asi : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; } let mayLoad=1, hasSideEffects=0 in { @@ -3123,84 +3123,84 @@ let mayLoad=1, hasSideEffects=0 in { multiclass ST_VEC { def _v2_avar : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2}};", []>; def _v2_areg : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2}};", []>; def _v2_areg_64 : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2}};", []>; def _v2_ari : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr+$offset], {{$src1, $src2}};", []>; def _v2_ari_64 : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr+$offset], {{$src1, $src2}};", []>; def _v2_asi : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr+$offset], {{$src1, $src2}};", []>; def _v4_avar : NVPTXInst< (outs), (ins regclass:$src1, 
regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; def _v4_areg : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; def _v4_areg_64 : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; def _v4_ari : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; def _v4_ari_64 : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; def _v4_asi : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}" + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}" "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; } diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll new file mode 100644 index 0000000000000..2ae71bf1230e2 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll @@ -0,0 +1,951 @@ +; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify %} + +; CHECK-LABEL: generic_plain +define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { + ; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load i8, ptr %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i8 %a.add, ptr %a + + ; CHECK: ld.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load i16, ptr %b + %b.add = add 
i16 %b.load, 1 + ; CHECK: st.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i16 %b.add, ptr %b + + ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load i32, ptr %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store i32 %c.add, ptr %c + + ; CHECK: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load i64, ptr %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store i64 %d.add, ptr %d + + ; CHECK: ld.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load float, ptr %c + %e.add = fadd float %e.load, 1. + ; CHECK: st.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store float %e.add, ptr %c + + ; CHECK: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load double, ptr %c + %f.add = fadd double %f.load, 1. + ; CHECK: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store double %f.add, ptr %c + + ret void +} + +; CHECK-LABEL: generic_volatile +define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { + ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load volatile i8, ptr %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i8 %a.add, ptr %a + + ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load volatile i16, ptr %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i16 %b.add, ptr %b + + ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load volatile i32, ptr %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile i32 %c.add, ptr %c + + ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load volatile i64, ptr %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store volatile i64 %d.add, ptr %d + + ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load volatile float, ptr %c + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store volatile float %e.add, ptr %c + + ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load volatile double, ptr %c + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store volatile double %f.add, ptr %c + + ret void +} + +; CHECK-LABEL: generic_monotonic +define void @generic_monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a monotonic, align 1 + + ; CHECK: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b monotonic, align 2 + + ; CHECK: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c monotonic, align 4 + + ; CHECK: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d monotonic, align 8 + + ; CHECK: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e monotonic, align 4 + + ; CHECK: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e monotonic, align 8 + + ret void +} + +; CHECK-LABEL: generic_acq_rel +define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a release, align 1 + + ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b release, align 2 + + ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c release, align 4 + + ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d release, align 8 + + ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e acquire, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e release, align 4 + + ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e release, align 8 + + ret void +} + +; CHECK-LABEL: generic_monotonic_volatile +define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a monotonic, align 1 + + ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b monotonic, align 2 + + ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c monotonic, align 4 + + ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d monotonic, align 8 + + ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e monotonic, align 4 + + ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e monotonic, align 8 + + ret void +} + +;; global statespace + +; CHECK-LABEL: global_plain +define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr { + ; CHECK: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load i8, ptr addrspace(1) %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i8 %a.add, ptr addrspace(1) %a + + ; CHECK: ld.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load i16, ptr addrspace(1) %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i16 %b.add, ptr addrspace(1) %b + + ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load i32, ptr addrspace(1) %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store i32 %c.add, ptr addrspace(1) %c + + ; CHECK: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load i64, ptr addrspace(1) %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store i64 %d.add, ptr addrspace(1) %d + + ; CHECK: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load float, ptr addrspace(1) %c + %e.add = fadd float %e.load, 1. + ; CHECK: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store float %e.add, ptr addrspace(1) %c + + ; CHECK: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load double, ptr addrspace(1) %c + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store double %f.add, ptr addrspace(1) %c + + ret void +} + +; CHECK-LABEL: global_volatile +define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr { + ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load volatile i8, ptr addrspace(1) %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i8 %a.add, ptr addrspace(1) %a + + ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load volatile i16, ptr addrspace(1) %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i16 %b.add, ptr addrspace(1) %b + + ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load volatile i32, ptr addrspace(1) %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile i32 %c.add, ptr addrspace(1) %c + + ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load volatile i64, ptr addrspace(1) %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store volatile i64 %d.add, ptr addrspace(1) %d + + ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load volatile float, ptr addrspace(1) %c + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store volatile float %e.add, ptr addrspace(1) %c + + ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load volatile double, ptr addrspace(1) %c + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store volatile double %f.add, ptr addrspace(1) %c + + ret void +} + +; CHECK-LABEL: global_monotonic +define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1 + + ; CHECK: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2 + + ; CHECK: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4 + + ; CHECK: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8 + + ; CHECK: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr 
addrspace(1) %e monotonic, align 4 + + ; CHECK: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8 + + ret void +} + +; CHECK-LABEL: global_monotonic_volatile +define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1 + + ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2 + + ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4 + + ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8 + + ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4 + + ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8 + + ret void +} + +; CHECK-LABEL: global_acq_rel +define void @global_acq_rel(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a release, align 1 + + ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b release, align 2 + + ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c release, align 4 + + ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d release, align 8 + + ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e acquire, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e release, align 4 + + ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e release, align 8 + + ret void +} + +; CHECK-LABEL: global_acq_rel_volatile +define void @global_acq_rel_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a release, align 1 + + ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b release, align 2 + + ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c release, align 4 + + ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d release, align 8 + + ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e acquire, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e release, align 4 + + ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e release, align 8 + + ret void +} + +;; shared statespace + +; CHECK-LABEL: shared_plain +define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr { + ; CHECK: ld.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load i8, ptr addrspace(3) %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i8 %a.add, ptr addrspace(3) %a + + ; CHECK: ld.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load i16, ptr addrspace(3) %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i16 %b.add, ptr addrspace(3) %b + + ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load i32, ptr addrspace(3) %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store i32 %c.add, ptr addrspace(3) %c + + ; CHECK: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load i64, ptr addrspace(3) %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store i64 %d.add, ptr addrspace(3) %d + + ; CHECK: ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load float, ptr addrspace(3) %c + %e.add = fadd float %e.load, 1. 
+ ; CHECK: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store float %e.add, ptr addrspace(3) %c + + ; CHECK: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load double, ptr addrspace(3) %c + %f.add = fadd double %f.load, 1. + ; CHECK: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store double %f.add, ptr addrspace(3) %c + + ret void +} + +; CHECK-LABEL: shared_volatile +define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr { + ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load volatile i8, ptr addrspace(3) %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i8 %a.add, ptr addrspace(3) %a + + ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load volatile i16, ptr addrspace(3) %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i16 %b.add, ptr addrspace(3) %b + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load volatile i32, ptr addrspace(3) %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile i32 %c.add, ptr addrspace(3) %c + + ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load volatile i64, ptr addrspace(3) %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store volatile i64 %d.add, ptr addrspace(3) %d + + ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load volatile float, ptr addrspace(3) %c + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store volatile float %e.add, ptr addrspace(3) %c + + ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load volatile double, ptr addrspace(3) %c + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store volatile double %f.add, ptr addrspace(3) %c + + ret void +} + +; CHECK-LABEL: shared_monotonic +define void @shared_monotonic(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1 + + ; CHECK: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2 + + ; CHECK: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4 + + ; CHECK: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8 + + ; CHECK: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4 + + ; CHECK: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8 + + ret void +} + +; CHECK-LABEL: shared_monotonic_volatile +define void @shared_monotonic_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1 + + ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2 + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4 + + ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8 + + ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4 + + ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8 + + ret void +} + +; CHECK-LABEL: shared_acq_rel +define void @shared_acq_rel(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a release, align 1 + + ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b release, align 2 + + ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c release, align 4 + + ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d release, align 8 + + ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e acquire, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e release, align 4 + + ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e release, align 8 + + ret void +} + +; CHECK-LABEL: shared_acq_rel_volatile +define void @shared_acq_rel_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a release, align 1 + + ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b release, align 2 + + ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c release, align 4 + + ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d release, align 8 + + ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e acquire, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e release, align 4 + + ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e release, align 8 + + ret void +} + +;; local statespace + +; CHECK-LABEL: local_plain +define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load i8, ptr addrspace(5) %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i8 %a.add, ptr addrspace(5) %a + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load i16, ptr addrspace(5) %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i16 %b.add, ptr addrspace(5) %b + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load i32, ptr addrspace(5) %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store i32 %c.add, ptr addrspace(5) %c + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load i64, ptr addrspace(5) %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store i64 %d.add, ptr addrspace(5) %d + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load float, ptr addrspace(5) %c + %e.add = fadd float %e.load, 1. 
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store float %e.add, ptr addrspace(5) %c + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load double, ptr addrspace(5) %c + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store double %f.add, ptr addrspace(5) %c + + ret void +} + +; CHECK-LABEL: local_volatile +define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load volatile i8, ptr addrspace(5) %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i8 %a.add, ptr addrspace(5) %a + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load volatile i16, ptr addrspace(5) %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i16 %b.add, ptr addrspace(5) %b + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load volatile i32, ptr addrspace(5) %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile i32 %c.add, ptr addrspace(5) %c + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load volatile i64, ptr addrspace(5) %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store volatile i64 %d.add, ptr addrspace(5) %d + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load volatile float, ptr addrspace(5) %c + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store volatile float %e.add, ptr addrspace(5) %c + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load volatile double, ptr addrspace(5) %c + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store volatile double %f.add, ptr addrspace(5) %c + + ret void +} + +; CHECK-LABEL: local_monotonic +define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8 + + ret void +} + +; CHECK-LABEL: local_monotonic_volatile +define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8 + + ret void +} + +; CHECK-LABEL: local_acq_rel +define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a release, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b release, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c release, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d release, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e acquire, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e release, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e release, align 8 + + ret void +} + +; CHECK-LABEL: local_acq_rel_volatile +define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a release, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b release, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c release, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d release, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e acquire, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e release, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e release, align 8 + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll index c477bd9e744cd..27065f5eca9f4 100644 --- a/llvm/test/CodeGen/NVPTX/load-store.ll +++ b/llvm/test/CodeGen/NVPTX/load-store.ll @@ -1,8 +1,10 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} -; CHECK-LABEL: plain -define void @plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { +; generic statespace + +; CHECK-LABEL: generic_plain +define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { ; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load i8, ptr %a %a.add = add i8 %a.load, 1 @@ -42,8 +44,8 @@ define void @plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { ret void } -; CHECK-LABEL: volatile -define void @volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { +; CHECK-LABEL: generic_volatile +define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load volatile i8, ptr %a %a.add = add i8 %a.load, 1 @@ -83,8 +85,8 @@ define void @volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { ret void } -; CHECK-LABEL: monotonic -define void @monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { +; CHECK-LABEL: generic_monotonic +define void @generic_monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr %a monotonic, align 1 %a.add = add i8 %a.load, 1 @@ -123,3 +125,542 @@ define void @monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_add ret void } + +; CHECK-LABEL: generic_monotonic_volatile +define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a monotonic, align 1 + + ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b monotonic, align 2 + + ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c monotonic, align 4 + + ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d monotonic, align 8 + + ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e monotonic, align 4 + + ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e monotonic, align 8 + + ret void +} + +;; global statespace + +; CHECK-LABEL: global_plain +define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr { + ; CHECK: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load i8, ptr addrspace(1) %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i8 %a.add, ptr addrspace(1) %a + + ; CHECK: ld.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load i16, ptr addrspace(1) %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i16 %b.add, ptr addrspace(1) %b + + ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load i32, ptr addrspace(1) %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store i32 %c.add, ptr addrspace(1) %c + + ; CHECK: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load i64, ptr addrspace(1) %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store i64 %d.add, ptr addrspace(1) %d + + ; CHECK: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load float, ptr addrspace(1) %c + %e.add = fadd float %e.load, 1. + ; CHECK: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store float %e.add, ptr addrspace(1) %c + + ; CHECK: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load double, ptr addrspace(1) %c + %f.add = fadd double %f.load, 1. + ; CHECK: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store double %f.add, ptr addrspace(1) %c + + ret void +} + +; CHECK-LABEL: global_volatile +define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr { + ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load volatile i8, ptr addrspace(1) %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i8 %a.add, ptr addrspace(1) %a + + ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load volatile i16, ptr addrspace(1) %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i16 %b.add, ptr addrspace(1) %b + + ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load volatile i32, ptr addrspace(1) %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile i32 %c.add, ptr addrspace(1) %c + + ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load volatile i64, ptr addrspace(1) %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store volatile i64 %d.add, ptr addrspace(1) %d + + ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load volatile float, ptr addrspace(1) %c + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store volatile float %e.add, ptr addrspace(1) %c + + ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load volatile double, ptr addrspace(1) %c + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store volatile double %f.add, ptr addrspace(1) %c + + ret void +} + +; CHECK-LABEL: global_monotonic +define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1 + + ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2 + + ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4 + + ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8 + + ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4 + + ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8 + + ret void +} + +; CHECK-LABEL: global_monotonic_volatile +define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1 + + ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2 + + ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4 + + ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8 + + ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4 + + ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8 + + ret void +} + +;; shared statespace + +; CHECK-LABEL: shared_plain +define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr { + ; CHECK: ld.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load i8, ptr addrspace(3) %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i8 %a.add, ptr addrspace(3) %a + + ; CHECK: ld.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load i16, ptr addrspace(3) %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i16 %b.add, ptr addrspace(3) %b + + ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load i32, ptr addrspace(3) %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store i32 %c.add, ptr addrspace(3) %c + + ; CHECK: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load i64, ptr addrspace(3) %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store i64 %d.add, ptr addrspace(3) %d + + ; CHECK: ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load float, ptr addrspace(3) %c + %e.add = fadd float %e.load, 1. 
+ ; CHECK: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store float %e.add, ptr addrspace(3) %c + + ; CHECK: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load double, ptr addrspace(3) %c + %f.add = fadd double %f.load, 1. + ; CHECK: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store double %f.add, ptr addrspace(3) %c + + ret void +} + +; CHECK-LABEL: shared_volatile +define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr { + ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load volatile i8, ptr addrspace(3) %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i8 %a.add, ptr addrspace(3) %a + + ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load volatile i16, ptr addrspace(3) %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i16 %b.add, ptr addrspace(3) %b + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load volatile i32, ptr addrspace(3) %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile i32 %c.add, ptr addrspace(3) %c + + ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load volatile i64, ptr addrspace(3) %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store volatile i64 %d.add, ptr addrspace(3) %d + + ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load volatile float, ptr addrspace(3) %c + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store volatile float %e.add, ptr addrspace(3) %c + + ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load volatile double, ptr addrspace(3) %c + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store volatile double %f.add, ptr addrspace(3) %c + + ret void +} + +; CHECK-LABEL: shared_monotonic +define void @shared_monotonic(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1 + + ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2 + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4 + + ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8 + + ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4 + + ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8 + + ret void +} + +; CHECK-LABEL: shared_monotonic_volatile +define void @shared_monotonic_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1 + + ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2 + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4 + + ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8 + + ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4 + + ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8 + + ret void +} + +;; local statespace + +; CHECK-LABEL: local_plain +define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load i8, ptr addrspace(5) %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i8 %a.add, ptr addrspace(5) %a + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load i16, ptr addrspace(5) %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i16 %b.add, ptr addrspace(5) %b + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load i32, ptr addrspace(5) %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store i32 %c.add, ptr addrspace(5) %c + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load i64, ptr addrspace(5) %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store i64 %d.add, ptr addrspace(5) %d + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load float, ptr addrspace(5) %c + %e.add = fadd float %e.load, 1. 
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store float %e.add, ptr addrspace(5) %c + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load double, ptr addrspace(5) %c + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store double %f.add, ptr addrspace(5) %c + + ret void +} + +; CHECK-LABEL: local_volatile +define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load volatile i8, ptr addrspace(5) %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i8 %a.add, ptr addrspace(5) %a + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load volatile i16, ptr addrspace(5) %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i16 %b.add, ptr addrspace(5) %b + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load volatile i32, ptr addrspace(5) %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile i32 %c.add, ptr addrspace(5) %c + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load volatile i64, ptr addrspace(5) %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store volatile i64 %d.add, ptr addrspace(5) %d + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load volatile float, ptr addrspace(5) %c + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store volatile float %e.add, ptr addrspace(5) %c + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load volatile double, ptr addrspace(5) %c + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store volatile double %f.add, ptr addrspace(5) %c + + ret void +} + +; CHECK-LABEL: local_monotonic +define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8 + + ret void +} + +; CHECK-LABEL: local_monotonic_volatile +define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8 + + ret void +} From 5584b35c2af38bd3bfa6960d1dca4569ef94e1f0 Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Thu, 11 Jul 2024 19:10:57 +0200 Subject: [PATCH 2/7] [NVPTX] Move atomics and MMIO detection to NVPTXSubtarget --- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 16 ++++++---------- llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 7 ++++++- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 41c2da89f116d..114a875f6d31e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -719,12 +719,8 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N, AtomicOrdering Ordering = N->getSuccessOrdering(); auto CodeAddrSpace = getCodeAddrSpace(N); - // Supports relaxed, acquire, release, weak: - bool hasAtomics = - Subtarget->getPTXVersion() >= 60 && Subtarget->getSmVersion() >= 70; - // Supports mmio: - bool hasRelaxedMMIO = - Subtarget->getPTXVersion() >= 82 && Subtarget->getSmVersion() >= 70; + bool HasMemoryOrdering = Subtarget->hasMemoryOrdering(); + bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO(); // TODO: lowering for SequentiallyConsistent Operations: for now, we error. // TODO: lowering for AcquireRelease Operations: for now, we error. @@ -770,7 +766,7 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N, // sm_60 and older. 
if (!(Ordering == AtomicOrdering::NotAtomic || Ordering == AtomicOrdering::Monotonic) && - !hasAtomics) { + !HasMemoryOrdering) { SmallString<256> Msg; raw_svector_ostream OS(Msg); OS << "PTX does not support \"atomic\" for orderings different than" @@ -783,7 +779,7 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N, // the volatile semantics and preserve the atomic ones. [4]: TODO: volatile // atomics with order stronger than relaxed are currently unimplemented in // sm_60 and older. - if (!hasAtomics && N->isVolatile() && + if (!HasMemoryOrdering && N->isVolatile() && !(Ordering == AtomicOrdering::NotAtomic || Ordering == AtomicOrdering::Monotonic)) { SmallString<256> Msg; @@ -804,7 +800,7 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N, CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL || CodeAddrSpace == NVPTX::PTXLdStInstCode::SHARED); bool useRelaxedMMIO = - hasRelaxedMMIO && CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL; + HasRelaxedMMIO && CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL; switch (Ordering) { case AtomicOrdering::NotAtomic: @@ -817,7 +813,7 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N, : addrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile : NVPTX::PTXLdStInstCode::NotAtomic; else - return hasAtomics ? NVPTX::PTXLdStInstCode::Relaxed + return HasMemoryOrdering ? NVPTX::PTXLdStInstCode::Relaxed : addrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile : NVPTX::PTXLdStInstCode::NotAtomic; case AtomicOrdering::Acquire: diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 3ca4c1a24c79a..8df41913ff12e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -78,13 +78,18 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { bool hasAtomBitwise64() const { return SmVersion >= 32; } bool hasAtomMinMax64() const { return SmVersion >= 32; } bool hasLDG() const { return SmVersion >= 32; } - inline bool hasHWROT32() const { return SmVersion >= 32; } + bool hasHWROT32() const { return SmVersion >= 32; } bool hasImageHandles() const; bool hasFP16Math() const { return SmVersion >= 53; } bool hasBF16Math() const { return SmVersion >= 80; } bool allowFP16Math() const; bool hasMaskOperator() const { return PTXVersion >= 71; } bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; } + // Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire, + // release, acq_rel, sc) ? + bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; } + // Does SM & PTX support atomic relaxed MMIO operations ? 
+ bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; } unsigned int getFullSmVersion() const { return FullSmVersion; } unsigned int getSmVersion() const { return getFullSmVersion() / 10; } // GPUs with "a" suffix have include architecture-accelerated features that From 069c69466fedd2262c2e2f1dbd5e722ccf03d424 Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Thu, 11 Jul 2024 18:04:03 +0200 Subject: [PATCH 3/7] [NVPTX]: Remove redundant check and fix capitalization --- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 35 +++++++-------------- 1 file changed, 11 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 114a875f6d31e..060412072880b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -741,7 +741,7 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N, // | Relaxed | Yes | Generic / Shared [0] | .volatile | .volatile | // | Relaxed | Yes | Global [0] | .volatile | .mmio.relaxed.sys (PTX 8.2+) or .volatile (PTX 8.1-) | // | Relaxed | Yes | Local / Const / Param | plain [1] | .weak [1] | - // | Other | Yes | Generic / Shared / Global [0] | Error [4] | [3] | + // | Other | Yes | Generic / Shared / Global [0] | Error [2] | [3] | // clang-format on @@ -763,7 +763,7 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N, } // [2]: Atomics with Ordering different than Relaxed are not supported on - // sm_60 and older. + // sm_60 and older; this includes volatile atomics. if (!(Ordering == AtomicOrdering::NotAtomic || Ordering == AtomicOrdering::Monotonic) && !HasMemoryOrdering) { @@ -776,45 +776,32 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N, } // [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop - // the volatile semantics and preserve the atomic ones. [4]: TODO: volatile - // atomics with order stronger than relaxed are currently unimplemented in - // sm_60 and older. - if (!HasMemoryOrdering && N->isVolatile() && - !(Ordering == AtomicOrdering::NotAtomic || - Ordering == AtomicOrdering::Monotonic)) { - SmallString<256> Msg; - raw_svector_ostream OS(Msg); - OS << "PTX does not support \"volatile atomic\" for orderings different " - "than \"NotAtomic\" or \"Monotonic\" for sm_60 and older, but order " - "is: \"" - << toIRString(Ordering) << "\"."; - report_fatal_error(OS.str()); - } + // the volatile semantics and preserve the atomic ones. // PTX volatile and PTX atomics are not available for statespace that differ // from .generic, .global, or .shared. The behavior of PTX volatile and PTX // atomics is undefined if the generic address does not refer to a .global or // .shared memory location. - bool addrGenericOrGlobalOrShared = + bool AddrGenericOrGlobalOrShared = (CodeAddrSpace == NVPTX::PTXLdStInstCode::GENERIC || CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL || CodeAddrSpace == NVPTX::PTXLdStInstCode::SHARED); - bool useRelaxedMMIO = + bool UseRelaxedMMIO = HasRelaxedMMIO && CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL; switch (Ordering) { case AtomicOrdering::NotAtomic: - return N->isVolatile() && addrGenericOrGlobalOrShared + return N->isVolatile() && AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile : NVPTX::PTXLdStInstCode::NotAtomic; case AtomicOrdering::Monotonic: if (N->isVolatile()) - return useRelaxedMMIO ? NVPTX::PTXLdStInstCode::RelaxedMMIO - : addrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile + return UseRelaxedMMIO ? 
NVPTX::PTXLdStInstCode::RelaxedMMIO + : AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile : NVPTX::PTXLdStInstCode::NotAtomic; else return HasMemoryOrdering ? NVPTX::PTXLdStInstCode::Relaxed - : addrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile + : AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile : NVPTX::PTXLdStInstCode::NotAtomic; case AtomicOrdering::Acquire: if (!N->readMem()) { @@ -825,7 +812,7 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N, N->print(OS); report_fatal_error(OS.str()); } - return addrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Acquire + return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Acquire : NVPTX::PTXLdStInstCode::NotAtomic; case AtomicOrdering::Release: if (!N->writeMem()) { @@ -836,7 +823,7 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N, N->print(OS); report_fatal_error(OS.str()); } - return addrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Release + return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Release : NVPTX::PTXLdStInstCode::NotAtomic; case AtomicOrdering::AcquireRelease: { SmallString<256> Msg; From d6d7c19b70f18ba24bb9313b8c3d163e54261ab1 Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Fri, 12 Jul 2024 00:58:03 +0200 Subject: [PATCH 4/7] [NVPTX] Use ptxas-12.2 for new tests --- llvm/test/CodeGen/NVPTX/load-store-sm-70.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll index 2ae71bf1230e2..4df1b0e78261a 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify %} +; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify %} ; CHECK-LABEL: generic_plain define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { From 08a16c1b0cefd81538168206219f5779d263772f Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Fri, 12 Jul 2024 01:37:54 +0200 Subject: [PATCH 5/7] [NVPTX] Improve comments --- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 78 ++++++++++++++------- 1 file changed, 54 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 060412072880b..99b7701186458 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -730,31 +730,63 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N, // Lowering for non-SequentiallyConsistent Operations // - // | Atomic | Volatile | Statespace | Lowering sm_60- | Lowering sm_70+ | - // |---------|----------|-------------------------------|-----------------|------------------------------------------------------| - // | No | No | All | plain | .weak | - // | No | Yes | Generic / Shared / Global [0] | .volatile | .volatile | - // | No | Yes | Local / Const / Param | plain [1] | .weak [1] | - // | Relaxed | No | Generic / Shared / Global [0] | .volatile | | - // | Other | No | Generic / Shared / Global [0] | Error [2] | | - // | Yes | No | Local / Const / Param | plain [1] | .weak [1] | - // | Relaxed | Yes | Generic / Shared [0] | .volatile | .volatile | - // | Relaxed | Yes | Global [0] | .volatile | .mmio.relaxed.sys (PTX 8.2+) or .volatile (PTX 8.1-) | - // | Relaxed | Yes | Local / Const / 
Param | plain [1] | .weak [1] | - // | Other | Yes | Generic / Shared / Global [0] | Error [2] | [3] | + // | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ | + // |---------|----------|--------------------|------------|------------------------------| + // | No | No | All | plain | .weak | + // | No | Yes | Generic,Shared, | .volatile | .volatile | + // | | | Global [0] | | | + // | No | Yes | Local,Const,Param | plain [1] | .weak [1] | + // | Relaxed | No | Generic,Shared, | | | + // | | | Global [0] | .volatile | | + // | Other | No | Generic,Shared, | Error [2] | | + // | | | Global [0] | | | + // | Yes | No | Local,Const,Param | plain [1] | .weak [1] | + // | Relaxed | Yes | Generic,Shared [0] | .volatile | .volatile | + // | Relaxed | Yes | Global [0] | .volatile | .mmio.relaxed.sys (PTX 8.2+) | + // | | | | | or .volatile (PTX 8.1-) | + // | Relaxed | Yes | Local,Const,Param | plain [1] | .weak [1] | + // | Other | Yes | Generic, Shared, | Error [2] | [3] | + // | | | / Global [0] | | | // clang-format on - // [0]: volatile and atomics are only supported on generic addressing to - // shared or global, or shared, or global. - // MMIO requires generic addressing to global or global, but - // (TODO) we only implement it for global. + // [0]: volatile and atomics are only supported on global or shared + // memory locations, accessed via generic/shared/global pointers. + // MMIO is only supported on global memory locations, + // accessed via generic/global pointers. + // TODO: Implement MMIO access via generic pointer to global. + // Currently implemented for global pointers only. - // [1]: TODO: this implementation exhibits PTX Undefined Behavior; it - // fails to preserve the side-effects of atomics and volatile - // accesses in LLVM IR to local / const / param, causing - // well-formed LLVM-IR & CUDA C++ programs to be miscompiled - // in sm_70+. + // [1]: Lowering volatile/atomic operations to non-volatile/non-atomic + // PTX instructions fails to preserve their C++ side-effects. + // + // Example (https://github.com/llvm/llvm-project/issues/62057): + // + // void example() { + // std::atomic True = true; + // while (True.load(std::memory_order_relaxed)); + // } + // + // A C++ program that calls "example" is well-defined: the infinite loop + // performs an atomic operation. By lowering volatile/atomics to + // "weak" memory operations, we are transforming the above into: + // + // void undefined_behavior() { + // bool True = true; + // while (True); + // } + // + // which exhibits undefined behavior in both C++ and PTX. + // + // Calling "example" in CUDA C++ compiled for sm_60- exhibits undefined + // behavior due to lack of Independent Forward Progress. Lowering these + // to weak memory operations in sm_60- is therefore fine. + // + // TODO: lower atomic and volatile operatios to memory locations + // in local, const, and param to two PTX operations in sm_70+: + // - the "weak" memory operation we are currently lowering to, and + // - some other memory operation that preserves the side-effect, e.g., + // a dummy volatile load. 
if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL || CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT || @@ -835,7 +867,6 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N, } case AtomicOrdering::SequentiallyConsistent: case AtomicOrdering::Unordered: - default: { // TODO: support AcquireRelease and SequentiallyConsistent SmallString<256> Msg; raw_svector_ostream OS(Msg); @@ -843,9 +874,8 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N, << toIRString(Ordering) << "\" yet."; report_fatal_error(OS.str()); } - } - report_fatal_error("unreachable"); + llvm_unreachable("unexpected unhandled case"); } static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, From 54a98065c2cd4a49560baa62c57b41e3acfca2ae Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Fri, 12 Jul 2024 14:55:32 -0700 Subject: [PATCH 6/7] [NVPTX] Add missing -arch to ptxas test invocation --- llvm/test/CodeGen/NVPTX/load-store-sm-70.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll index 4df1b0e78261a..7cdced1778a53 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s -; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify %} +; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %} ; CHECK-LABEL: generic_plain define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { From 46ab54a80a1a0508f092487f282f32cf473593ea Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Fri, 12 Jul 2024 14:56:18 -0700 Subject: [PATCH 7/7] [NVPTX] Fix comment typos --- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 99b7701186458..371ec8596ef63 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -782,11 +782,11 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N, // behavior due to lack of Independent Forward Progress. Lowering these // to weak memory operations in sm_60- is therefore fine. // - // TODO: lower atomic and volatile operatios to memory locations - // in local, const, and param to two PTX operations in sm_70+: - // - the "weak" memory operation we are currently lowering to, and - // - some other memory operation that preserves the side-effect, e.g., - // a dummy volatile load. + // TODO: lower atomic and volatile operations to memory locations + // in local, const, and param to two PTX instructions in sm_70+: + // - the "weak" memory instruction we are currently lowering to, and + // - some other instruction that preserves the side-effect, e.g., + // a dead dummy volatile load. if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL || CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT ||
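
For reference, the CUDA C++ sketch below illustrates the kind of source-level
code whose IR matches the patterns tested above. It is illustrative only and
not part of the patch series; it assumes clang's CUDA toolchain with its
__atomic_* builtins available in device code, and compilation with the same
-mcpu/-mattr settings as the RUN lines (sm_70, +ptx82). The exact PTX also
depends on address-space inference: pointers proven to point to global memory
get the .global forms checked in the global_* tests, otherwise the generic
forms apply.

// Illustrative sketch, not part of this patch.
__global__ void example(volatile int *v, int *g, int *out) {
  // Non-atomic volatile access: expected to lower to ld.volatile.u32 /
  // st.volatile.u32 (or the .global variants if v is known to be global).
  int x = *v;
  *v = x + 1;

  // Relaxed atomic access: expected to lower to ld.relaxed.sys.u32 /
  // st.relaxed.sys.u32 on sm_70 with PTX >= 6.0, and to ld.volatile.u32 /
  // st.volatile.u32 on sm_60 and older, matching the generic_monotonic tests.
  int y = __atomic_load_n(g, __ATOMIC_RELAXED);
  __atomic_store_n(g, y + 1, __ATOMIC_RELAXED);

  *out = x + y;
}

Orderings stronger than relaxed (acquire/release) map to the
ld.acquire.sys / st.release.sys forms on sm_70+ and are rejected with a fatal
error on sm_60 and older, as encoded in getCodeMemorySemantic above.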