diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index a004d64c21cc6..5b568b0487b45 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -13,12 +13,14 @@
 #include "MCTargetDesc/NVPTXInstPrinter.h"
 #include "MCTargetDesc/NVPTXBaseInfo.h"
 #include "NVPTX.h"
+#include "NVPTXUtilities.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/FormattedStream.h"
 #include <cctype>
 using namespace llvm;
@@ -228,31 +230,29 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
   const MCOperand &MO = MI->getOperand(OpNum);
   int Imm = (int) MO.getImm();
   if (!strcmp(Modifier, "sem")) {
-    switch (Imm) {
-    case NVPTX::PTXLdStInstCode::NotAtomic:
+    auto Ordering = NVPTX::Ordering(Imm);
+    switch (Ordering) {
+    case NVPTX::Ordering::NotAtomic:
       break;
-    case NVPTX::PTXLdStInstCode::Volatile:
+    case NVPTX::Ordering::Volatile:
       O << ".volatile";
       break;
-    case NVPTX::PTXLdStInstCode::Relaxed:
+    case NVPTX::Ordering::Relaxed:
       O << ".relaxed.sys";
       break;
-    case NVPTX::PTXLdStInstCode::Acquire:
+    case NVPTX::Ordering::Acquire:
       O << ".acquire.sys";
       break;
-    case NVPTX::PTXLdStInstCode::Release:
+    case NVPTX::Ordering::Release:
       O << ".release.sys";
       break;
-    case NVPTX::PTXLdStInstCode::RelaxedMMIO:
+    case NVPTX::Ordering::RelaxedMMIO:
       O << ".mmio.relaxed.sys";
       break;
     default:
-      SmallString<256> Msg;
-      raw_svector_ostream OS(Msg);
-      OS << "NVPTX LdStCode Printer does not support \"" << Imm
-         << "\" sem modifier.";
-      report_fatal_error(OS.str());
-      break;
+      report_fatal_error(formatv(
+          "NVPTX LdStCode Printer does not support \"{}\" sem modifier.",
+          OrderingToCString(Ordering)));
     }
   } else if (!strcmp(Modifier, "addsp")) {
     switch (Imm) {
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 3c7167b157025..f6f6acb9e13c9 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -16,6 +16,7 @@

 #include "llvm/IR/PassManager.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/AtomicOrdering.h"
 #include "llvm/Support/CodeGen.h"

 namespace llvm {
@@ -106,15 +107,25 @@ enum LoadStore {
   isStoreShift = 6
 };

-namespace PTXLdStInstCode {
-enum MemorySemantic {
-  NotAtomic = 0, // PTX calls these: "Weak"
-  Volatile = 1,
-  Relaxed = 2,
-  Acquire = 3,
-  Release = 4,
-  RelaxedMMIO = 5
+// Extends LLVM AtomicOrdering with PTX Orderings:
+using OrderingUnderlyingType = unsigned int;
+enum Ordering : OrderingUnderlyingType {
+  NotAtomic = (OrderingUnderlyingType)
+      AtomicOrdering::NotAtomic, // PTX calls these: "Weak"
+  // Unordered = 1, // NVPTX maps LLVM Unordered to Relaxed
+  Relaxed = (OrderingUnderlyingType)AtomicOrdering::Monotonic,
+  // Consume = 3, // Unimplemented in LLVM; NVPTX would map to "Acquire"
+  Acquire = (OrderingUnderlyingType)AtomicOrdering::Acquire,
+  Release = (OrderingUnderlyingType)AtomicOrdering::Release,
+  // AcquireRelease = 6, // TODO
+  SequentiallyConsistent =
+      (OrderingUnderlyingType)AtomicOrdering::SequentiallyConsistent,
+  Volatile = SequentiallyConsistent + 1,
+  RelaxedMMIO = Volatile + 1,
+  LAST = RelaxedMMIO
 };
+
+namespace PTXLdStInstCode {
 enum AddressSpace {
   GENERIC = 0,
   GLOBAL = 1,
@@ -134,7 +145,7 @@ enum VecType {
   V2 = 2,
   V4 = 4
 };
-}
+} // namespace PTXLdStInstCode

 /// PTXCvtMode -
Conversion code enumeration namespace PTXCvtMode { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 96456ad0547ea..25c198f0121e5 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -22,6 +22,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetIntrinsicInfo.h" @@ -714,21 +715,28 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) { return NVPTX::PTXLdStInstCode::GENERIC; } -static unsigned int getCodeMemorySemantic(MemSDNode *N, - const NVPTXSubtarget *Subtarget) { +namespace { + +struct OperationOrderings { + NVPTX::Ordering InstructionOrdering, FenceOrdering; + OperationOrderings(NVPTX::Ordering IO = NVPTX::Ordering::NotAtomic, + NVPTX::Ordering FO = NVPTX::Ordering::NotAtomic) + : InstructionOrdering(IO), FenceOrdering(FO) {} +}; + +static OperationOrderings +getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) { AtomicOrdering Ordering = N->getSuccessOrdering(); auto CodeAddrSpace = getCodeAddrSpace(N); bool HasMemoryOrdering = Subtarget->hasMemoryOrdering(); bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO(); - // TODO: lowering for SequentiallyConsistent Operations: for now, we error. - // TODO: lowering for AcquireRelease Operations: for now, we error. - // - // clang-format off - // Lowering for non-SequentiallyConsistent Operations + // Lowering for Load/Store Operations (note: AcquireRelease Loads or Stores error). + // Note: uses of Relaxed in the Atomic column of this table refer + // to LLVM AtomicOrdering::Monotonic. // // | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ | // |---------|----------|--------------------|------------|------------------------------| @@ -749,6 +757,25 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N, // | Other | Yes | Generic, Shared, | Error [2] | [3] | // | | | / Global [0] | | | + // Lowering of CUDA C++ SequentiallyConsistent Operations and Fences to PTX + // by following the ABI proven sound in: + // Lustig et al, A Formal Analysis of the NVIDIA PTX Memory Consistency Model, ASPLOS’19. 
+ // https://dl.acm.org/doi/pdf/10.1145/3297858.3304043 + // + // | CUDA C++ Atomic Operation or Atomic Fence | PTX Atomic Operation or Fence | + // |------------------------------------------------------|-------------------------------| + // | cuda::atomic_thread_fence | fence.sc.; | + // | (memory_order_seq_cst, cuda::thread_scope_) | | + // |------------------------------------------------------|-------------------------------| + // | cuda::atomic_load | fence.sc.; | + // | (memory_order_seq_cst, cuda::thread_scope_) | ld.acquire.; | + // |------------------------------------------------------|-------------------------------| + // | cuda::atomic_store | fence.sc.; | + // | (memory_order_seq_cst, cuda::thread_scope_) | st.release.; | + // |------------------------------------------------------|-------------------------------| + // | cuda::atomic_fetch_ | fence.sc.; | + // | (memory_order_seq_cst, cuda::thread_scope_) | atom.acq_rel.; | + // clang-format on // [0]: volatile and atomics are only supported on global or shared @@ -788,11 +815,10 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N, // - the "weak" memory instruction we are currently lowering to, and // - some other instruction that preserves the side-effect, e.g., // a dead dummy volatile load. - if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL || CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT || CodeAddrSpace == NVPTX::PTXLdStInstCode::PARAM) { - return NVPTX::PTXLdStInstCode::NotAtomic; + return NVPTX::Ordering::NotAtomic; } // [2]: Atomics with Ordering different than Unordered or Relaxed are not @@ -801,12 +827,11 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N, Ordering == AtomicOrdering::Unordered || Ordering == AtomicOrdering::Monotonic) && !HasMemoryOrdering) { - SmallString<256> Msg; - raw_svector_ostream OS(Msg); - OS << "PTX does not support \"atomic\" for orderings different than" - "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order is: \"" - << toIRString(Ordering) << "\"."; - report_fatal_error(OS.str()); + report_fatal_error( + formatv("PTX does not support \"atomic\" for orderings different than" + "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order " + "is: \"{}\".", + toIRString(Ordering))); } // [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop @@ -820,68 +845,76 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N, (CodeAddrSpace == NVPTX::PTXLdStInstCode::GENERIC || CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL || CodeAddrSpace == NVPTX::PTXLdStInstCode::SHARED); + if (!AddrGenericOrGlobalOrShared) + return NVPTX::Ordering::NotAtomic; + bool UseRelaxedMMIO = HasRelaxedMMIO && CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL; switch (Ordering) { case AtomicOrdering::NotAtomic: - return N->isVolatile() && AddrGenericOrGlobalOrShared - ? NVPTX::PTXLdStInstCode::Volatile - : NVPTX::PTXLdStInstCode::NotAtomic; + return N->isVolatile() ? NVPTX::Ordering::Volatile + : NVPTX::Ordering::NotAtomic; case AtomicOrdering::Unordered: // We lower unordered in the exact same way as 'monotonic' to respect // LLVM IR atomicity requirements. case AtomicOrdering::Monotonic: if (N->isVolatile()) - return UseRelaxedMMIO ? NVPTX::PTXLdStInstCode::RelaxedMMIO - : AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile - : NVPTX::PTXLdStInstCode::NotAtomic; + return UseRelaxedMMIO ? NVPTX::Ordering::RelaxedMMIO + : NVPTX::Ordering::Volatile; else - return HasMemoryOrdering ? NVPTX::PTXLdStInstCode::Relaxed - : AddrGenericOrGlobalOrShared ? 
NVPTX::PTXLdStInstCode::Volatile - : NVPTX::PTXLdStInstCode::NotAtomic; + return HasMemoryOrdering ? NVPTX::Ordering::Relaxed + : NVPTX::Ordering::Volatile; + // case AtomicOrdering::Consume: // If LLVM ever provides this, lower it to + // Acquire. case AtomicOrdering::Acquire: - if (!N->readMem()) { - SmallString<256> Msg; - raw_svector_ostream OS(Msg); - OS << "PTX only supports Acquire Ordering on reads: " - << N->getOperationName(); - N->print(OS); - report_fatal_error(OS.str()); - } - return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Acquire - : NVPTX::PTXLdStInstCode::NotAtomic; + if (!N->readMem()) + report_fatal_error( + formatv("PTX only supports Acquire Ordering on reads: {}", + N->getOperationName())); + return NVPTX::Ordering::Acquire; case AtomicOrdering::Release: - if (!N->writeMem()) { - SmallString<256> Msg; - raw_svector_ostream OS(Msg); - OS << "PTX only supports Release Ordering on writes: " - << N->getOperationName(); - N->print(OS); - report_fatal_error(OS.str()); - } - return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Release - : NVPTX::PTXLdStInstCode::NotAtomic; + if (!N->writeMem()) + report_fatal_error( + formatv("PTX only supports Release Ordering on writes: {}", + N->getOperationName())); + return NVPTX::Ordering::Release; case AtomicOrdering::AcquireRelease: { - SmallString<256> Msg; - raw_svector_ostream OS(Msg); - OS << "PTX only supports AcquireRelease Ordering on read-modify-write: " - << N->getOperationName(); - N->print(OS); - report_fatal_error(OS.str()); + report_fatal_error( + formatv("NVPTX does not support AcquireRelease Ordering on " + "read-modify-write " + "yet and PTX does not support it on loads or stores: {}", + N->getOperationName())); + } + case AtomicOrdering::SequentiallyConsistent: { + // LLVM-IR SequentiallyConsistent atomics map to a two-instruction PTX + // sequence including a "fence.sc.sco" and the memory instruction with an + // Ordering that differs from "sc": acq, rel, or acq_rel, depending on + // whether the memory operation is a read, write, or read-modify-write. + // + // This sets the ordering of the fence to SequentiallyConsistent, and + // sets the corresponding ordering for the instruction. + NVPTX::Ordering InstrOrder; + if (N->readMem()) + InstrOrder = NVPTX::Ordering::Acquire; + else if (N->writeMem()) + InstrOrder = NVPTX::Ordering::Release; + else + report_fatal_error( + formatv("NVPTX does not support SequentiallyConsistent Ordering on " + "read-modify-writes yet: {}", + N->getOperationName())); + return OperationOrderings(InstrOrder, + NVPTX::Ordering::SequentiallyConsistent); } - case AtomicOrdering::SequentiallyConsistent: - // TODO: support AcquireRelease and SequentiallyConsistent - SmallString<256> Msg; - raw_svector_ostream OS(Msg); - OS << "NVPTX backend does not support AtomicOrdering \"" - << toIRString(Ordering) << "\" yet."; - report_fatal_error(OS.str()); } - - llvm_unreachable("unexpected unhandled case"); + report_fatal_error( + formatv("NVPTX backend does not support AtomicOrdering \"{}\" yet.", + toIRString(Ordering))); } +} // namespace + static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, unsigned CodeAddrSpace, MachineFunction *F) { // We use ldg (i.e. 
ld.global.nc) for invariant loads from the global address @@ -924,6 +957,35 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, }); } +NVPTX::Ordering NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, + SDValue &Chain, + MemSDNode *N) { + // Some memory instructions - loads, stores, atomics - need an extra fence + // instruction. Get the memory order of the instruction, and that of its + // fence, if any. + auto [InstructionOrdering, FenceOrdering] = + getOperationOrderings(N, Subtarget); + + // If a fence is required before the operation, insert it: + switch (NVPTX::Ordering(FenceOrdering)) { + case NVPTX::Ordering::NotAtomic: + break; + case NVPTX::Ordering::SequentiallyConsistent: { + unsigned Op = Subtarget->hasMemoryOrdering() + ? NVPTX::atomic_thread_fence_seq_cst_sys + : NVPTX::INT_MEMBAR_SYS; + Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0); + break; + } + default: + report_fatal_error( + formatv("Unexpected fence ordering: \"{}\".", + OrderingToCString(NVPTX::Ordering(FenceOrdering)))); + } + + return InstructionOrdering; +} + bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) { unsigned IID = N->getConstantOperandVal(0); switch (IID) { @@ -1070,17 +1132,15 @@ static int getLdStRegType(EVT VT) { } bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { - SDLoc dl(N); MemSDNode *LD = cast(N); assert(LD->readMem() && "Expected load"); - LoadSDNode *PlainLoad = dyn_cast(N); - EVT LoadedVT = LD->getMemoryVT(); - SDNode *NVPTXLD = nullptr; // do not support pre/post inc/dec + LoadSDNode *PlainLoad = dyn_cast(N); if (PlainLoad && PlainLoad->isIndexed()) return false; + EVT LoadedVT = LD->getMemoryVT(); if (!LoadedVT.isSimple()) return false; @@ -1089,13 +1149,13 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) { return tryLDGLDU(N); } - - // Memory Semantic Setting - unsigned int CodeMemorySem = getCodeMemorySemantic(LD, Subtarget); - unsigned int PointerSize = CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace()); + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, LD); + // Type Setting: fromType + fromTypeWidth // // Sign : ISD::SEXTLOAD @@ -1105,45 +1165,42 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { MVT SimpleVT = LoadedVT.getSimpleVT(); MVT ScalarVT = SimpleVT.getScalarType(); // Read at least 8 bits (predicates are stored as 8-bit values) - unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits()); - unsigned int fromType; + unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits()); + unsigned int FromType; // Vector Setting - unsigned vecType = NVPTX::PTXLdStInstCode::Scalar; + unsigned VecType = NVPTX::PTXLdStInstCode::Scalar; if (SimpleVT.isVector()) { assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) && "Unexpected vector type"); // v2f16/v2bf16/v2i16 is loaded using ld.b32 - fromTypeWidth = 32; + FromTypeWidth = 32; } if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD)) - fromType = NVPTX::PTXLdStInstCode::Signed; + FromType = NVPTX::PTXLdStInstCode::Signed; else - fromType = getLdStRegType(ScalarVT); + FromType = getLdStRegType(ScalarVT); // Create the machine instruction DAG - SDValue Chain = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue Addr; SDValue Offset, Base; std::optional Opcode; MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy; + SmallVector Ops({getI32Imm(InstructionOrdering, DL), + 
getI32Imm(CodeAddrSpace, DL), + getI32Imm(VecType, DL), getI32Imm(FromType, DL), + getI32Imm(FromTypeWidth, DL)}); + if (SelectDirectAddr(N1, Addr)) { Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar, NVPTX::LD_i32_avar, NVPTX::LD_i64_avar, NVPTX::LD_f32_avar, NVPTX::LD_f64_avar); if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(CodeMemorySem, dl), - getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), - getI32Imm(fromType, dl), - getI32Imm(fromTypeWidth, dl), - Addr, - Chain}; - NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); + Ops.append({Addr, Chain}); } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset) : SelectADDRsi(N1.getNode(), N1, Base, Offset)) { Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi, @@ -1151,15 +1208,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_asi, NVPTX::LD_f64_asi); if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(CodeMemorySem, dl), - getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), - getI32Imm(fromType, dl), - getI32Imm(fromTypeWidth, dl), - Base, - Offset, - Chain}; - NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); + Ops.append({Base, Offset, Chain}); } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset) : SelectADDRri(N1.getNode(), N1, Base, Offset)) { if (PointerSize == 64) @@ -1173,15 +1222,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_ari, NVPTX::LD_f64_ari); if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(CodeMemorySem, dl), - getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), - getI32Imm(fromType, dl), - getI32Imm(fromTypeWidth, dl), - Base, - Offset, - Chain}; - NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); + Ops.append({Base, Offset, Chain}); } else { if (PointerSize == 64) Opcode = @@ -1194,16 +1235,11 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_areg, NVPTX::LD_f64_areg); if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(CodeMemorySem, dl), - getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), - getI32Imm(fromType, dl), - getI32Imm(fromTypeWidth, dl), - N1, - Chain}; - NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); + Ops.append({N1, Chain}); } + SDNode *NVPTXLD = + CurDAG->getMachineNode(*Opcode, DL, TargetVT, MVT::Other, Ops); if (!NVPTXLD) return false; @@ -1215,16 +1251,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { } bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { - - SDValue Chain = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - SDValue Addr, Offset, Base; - std::optional Opcode; - SDLoc DL(N); - SDNode *LD; MemSDNode *MemSD = cast(N); EVT LoadedVT = MemSD->getMemoryVT(); - if (!LoadedVT.isSimple()) return false; @@ -1233,12 +1261,12 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) { return tryLDGLDU(N); } - unsigned int PointerSize = CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace()); - // Memory Semantic Setting - unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget); + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, MemSD); // Vector Setting MVT SimpleVT = LoadedVT.getSimpleVT(); @@ -1286,6 +1314,16 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { FromTypeWidth = 32; } + SDValue Op1 = N->getOperand(1); + SDValue Addr, Offset, Base; + std::optional Opcode; + SDNode 
*LD; + + SmallVector Ops({getI32Imm(InstructionOrdering, DL), + getI32Imm(CodeAddrSpace, DL), + getI32Imm(VecType, DL), getI32Imm(FromType, DL), + getI32Imm(FromTypeWidth, DL)}); + if (SelectDirectAddr(Op1, Addr)) { switch (N->getOpcode()) { default: @@ -1305,14 +1343,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(CodeMemorySem, DL), - getI32Imm(CodeAddrSpace, DL), - getI32Imm(VecType, DL), - getI32Imm(FromType, DL), - getI32Imm(FromTypeWidth, DL), - Addr, - Chain}; - LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); + Ops.append({Addr, Chain}); } else if (PointerSize == 64 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset) : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) { @@ -1334,15 +1365,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(CodeMemorySem, DL), - getI32Imm(CodeAddrSpace, DL), - getI32Imm(VecType, DL), - getI32Imm(FromType, DL), - getI32Imm(FromTypeWidth, DL), - Base, - Offset, - Chain}; - LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); + Ops.append({Base, Offset, Chain}); } else if (PointerSize == 64 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset) : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) { @@ -1384,16 +1407,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(CodeMemorySem, DL), - getI32Imm(CodeAddrSpace, DL), - getI32Imm(VecType, DL), - getI32Imm(FromType, DL), - getI32Imm(FromTypeWidth, DL), - Base, - Offset, - Chain}; - - LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); + Ops.append({Base, Offset, Chain}); } else { if (PointerSize == 64) { switch (N->getOpcode()) { @@ -1434,15 +1448,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(CodeMemorySem, DL), - getI32Imm(CodeAddrSpace, DL), - getI32Imm(VecType, DL), - getI32Imm(FromType, DL), - getI32Imm(FromTypeWidth, DL), - Op1, - Chain}; - LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); + Ops.append({Op1, Chain}); } + LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); MachineMemOperand *MemRef = cast(N)->getMemOperand(); CurDAG->setNodeMemRefs(cast(LD), {MemRef}); @@ -1452,8 +1460,6 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { - - SDValue Chain = N->getOperand(0); SDValue Op1; MemSDNode *Mem; bool IsLDG = true; @@ -1483,12 +1489,7 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { Mem = cast(N); } - std::optional Opcode; - SDLoc DL(N); - SDNode *LD; - SDValue Base, Offset, Addr; EVT OrigType = N->getValueType(0); - EVT EltVT = Mem->getMemoryVT(); unsigned NumElts = 1; if (EltVT.isVector()) { @@ -1517,6 +1518,12 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { } InstVTs.push_back(MVT::Other); SDVTList InstVTList = CurDAG->getVTList(InstVTs); + SDValue Chain = N->getOperand(0); + + std::optional Opcode; + SDLoc DL(N); + SDNode *LD; + SDValue Base, Offset, Addr; if (SelectDirectAddr(Op1, Addr)) { switch (N->getOpcode()) { @@ -1867,19 +1874,17 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { } bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { - SDLoc dl(N); MemSDNode *ST = cast(N); assert(ST->writeMem() && "Expected store"); StoreSDNode *PlainStore = dyn_cast(N); AtomicSDNode *AtomicStore = dyn_cast(N); assert((PlainStore || AtomicStore) && "Expected store"); - EVT StoreVT = ST->getMemoryVT(); - SDNode *NVPTXST = nullptr; // do not support 
pre/post inc/dec if (PlainStore && PlainStore->isIndexed()) return false; + EVT StoreVT = ST->getMemoryVT(); if (!StoreVT.isSimple()) return false; @@ -1888,29 +1893,28 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { unsigned int PointerSize = CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace()); - // Memory Semantic Setting - unsigned int CodeMemorySem = getCodeMemorySemantic(ST, Subtarget); + SDLoc DL(N); + SDValue Chain = ST->getChain(); + auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, ST); // Vector Setting MVT SimpleVT = StoreVT.getSimpleVT(); - unsigned vecType = NVPTX::PTXLdStInstCode::Scalar; + unsigned VecType = NVPTX::PTXLdStInstCode::Scalar; // Type Setting: toType + toTypeWidth // - for integer type, always use 'u' - // MVT ScalarVT = SimpleVT.getScalarType(); - unsigned toTypeWidth = ScalarVT.getSizeInBits(); + unsigned ToTypeWidth = ScalarVT.getSizeInBits(); if (SimpleVT.isVector()) { assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) && "Unexpected vector type"); // v2x16 is stored using st.b32 - toTypeWidth = 32; + ToTypeWidth = 32; } - unsigned int toType = getLdStRegType(ScalarVT); + unsigned int ToType = getLdStRegType(ScalarVT); // Create the machine instruction DAG - SDValue Chain = ST->getChain(); SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal(); SDValue BasePtr = ST->getBasePtr(); SDValue Addr; @@ -1919,21 +1923,18 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { MVT::SimpleValueType SourceVT = Value.getNode()->getSimpleValueType(0).SimpleTy; + SmallVector Ops({Value, getI32Imm(InstructionOrdering, DL), + getI32Imm(CodeAddrSpace, DL), + getI32Imm(VecType, DL), getI32Imm(ToType, DL), + getI32Imm(ToTypeWidth, DL)}); + if (SelectDirectAddr(BasePtr, Addr)) { Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar, NVPTX::ST_i32_avar, NVPTX::ST_i64_avar, NVPTX::ST_f32_avar, NVPTX::ST_f64_avar); if (!Opcode) return false; - SDValue Ops[] = {Value, - getI32Imm(CodeMemorySem, dl), - getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), - getI32Imm(toType, dl), - getI32Imm(toTypeWidth, dl), - Addr, - Chain}; - NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops); + Ops.append({Addr, Chain}); } else if (PointerSize == 64 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset) : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) { @@ -1942,16 +1943,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { NVPTX::ST_f32_asi, NVPTX::ST_f64_asi); if (!Opcode) return false; - SDValue Ops[] = {Value, - getI32Imm(CodeMemorySem, dl), - getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), - getI32Imm(toType, dl), - getI32Imm(toTypeWidth, dl), - Base, - Offset, - Chain}; - NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops); + Ops.append({Base, Offset, Chain}); } else if (PointerSize == 64 ? 
SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset) : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) { @@ -1966,17 +1958,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { NVPTX::ST_f32_ari, NVPTX::ST_f64_ari); if (!Opcode) return false; - - SDValue Ops[] = {Value, - getI32Imm(CodeMemorySem, dl), - getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), - getI32Imm(toType, dl), - getI32Imm(toTypeWidth, dl), - Base, - Offset, - Chain}; - NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops); + Ops.append({Base, Offset, Chain}); } else { if (PointerSize == 64) Opcode = @@ -1989,17 +1971,12 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { NVPTX::ST_f32_areg, NVPTX::ST_f64_areg); if (!Opcode) return false; - SDValue Ops[] = {Value, - getI32Imm(CodeMemorySem, dl), - getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), - getI32Imm(toType, dl), - getI32Imm(toTypeWidth, dl), - BasePtr, - Chain}; - NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops); + Ops.append({BasePtr, Chain}); } + SDNode *NVPTXST = NVPTXST = + CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops); + if (!NVPTXST) return false; @@ -2010,11 +1987,9 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { } bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { - SDValue Chain = N->getOperand(0); SDValue Op1 = N->getOperand(1); SDValue Addr, Offset, Base; std::optional Opcode; - SDLoc DL(N); SDNode *ST; EVT EltVT = Op1.getValueType(); MemSDNode *MemSD = cast(N); @@ -2029,8 +2004,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { unsigned int PointerSize = CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace()); - // Memory Semantic Setting - unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget); + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, MemSD); // Type Setting: toType + toTypeWidth // - for integer type, always use 'u' @@ -2039,23 +2015,20 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { unsigned ToTypeWidth = ScalarVT.getSizeInBits(); unsigned ToType = getLdStRegType(ScalarVT); - SmallVector StOps; + SmallVector Ops; SDValue N2; unsigned VecType; switch (N->getOpcode()) { case NVPTXISD::StoreV2: VecType = NVPTX::PTXLdStInstCode::V2; - StOps.push_back(N->getOperand(1)); - StOps.push_back(N->getOperand(2)); + Ops.append({N->getOperand(1), N->getOperand(2)}); N2 = N->getOperand(3); break; case NVPTXISD::StoreV4: VecType = NVPTX::PTXLdStInstCode::V4; - StOps.push_back(N->getOperand(1)); - StOps.push_back(N->getOperand(2)); - StOps.push_back(N->getOperand(3)); - StOps.push_back(N->getOperand(4)); + Ops.append({N->getOperand(1), N->getOperand(2), N->getOperand(3), + N->getOperand(4)}); N2 = N->getOperand(5); break; default: @@ -2072,11 +2045,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { ToTypeWidth = 32; } - StOps.push_back(getI32Imm(CodeMemorySem, DL)); - StOps.push_back(getI32Imm(CodeAddrSpace, DL)); - StOps.push_back(getI32Imm(VecType, DL)); - StOps.push_back(getI32Imm(ToType, DL)); - StOps.push_back(getI32Imm(ToTypeWidth, DL)); + Ops.append({getI32Imm(InstructionOrdering, DL), getI32Imm(CodeAddrSpace, DL), + getI32Imm(VecType, DL), getI32Imm(ToType, DL), + getI32Imm(ToTypeWidth, DL)}); if (SelectDirectAddr(N2, Addr)) { switch (N->getOpcode()) { @@ -2095,7 +2066,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { NVPTX::STV_f32_v4_avar, std::nullopt); break; } - StOps.push_back(Addr); + Ops.push_back(Addr); } else if (PointerSize == 64 ? 
SelectADDRsi64(N2.getNode(), N2, Base, Offset) : SelectADDRsi(N2.getNode(), N2, Base, Offset)) { switch (N->getOpcode()) { @@ -2114,8 +2085,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt); break; } - StOps.push_back(Base); - StOps.push_back(Offset); + Ops.append({Base, Offset}); } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset) : SelectADDRri(N2.getNode(), N2, Base, Offset)) { if (PointerSize == 64) { @@ -2154,8 +2124,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { break; } } - StOps.push_back(Base); - StOps.push_back(Offset); + Ops.append({Base, Offset}); } else { if (PointerSize == 64) { switch (N->getOpcode()) { @@ -2194,15 +2163,15 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { break; } } - StOps.push_back(N2); + Ops.push_back(N2); } if (!Opcode) return false; - StOps.push_back(Chain); + Ops.push_back(Chain); - ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, StOps); + ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops); MachineMemOperand *MemRef = cast(N)->getMemOperand(); CurDAG->setNodeMemRefs(cast(ST), {MemRef}); @@ -2276,10 +2245,8 @@ bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) { unsigned OffsetVal = Offset->getAsZExtVal(); - SmallVector Ops; - Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32)); - Ops.push_back(Chain); - Ops.push_back(Glue); + SmallVector Ops( + {CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue}); ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops)); return true; @@ -2312,8 +2279,7 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) { SmallVector Ops; for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(N->getOperand(i + 2)); - Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32)); - Ops.push_back(Chain); + Ops.append({CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain}); // Determine target opcode // If we have an i1, use an 8-bit store. The lowering code in @@ -2493,10 +2459,8 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { SmallVector Ops; for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(N->getOperand(i + 3)); - Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32)); - Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32)); - Ops.push_back(Chain); - Ops.push_back(Glue); + Ops.append({CurDAG->getTargetConstant(ParamVal, DL, MVT::i32), + CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue}); // Determine target opcode // If we have an i1, use an 8-bit store. 
The lowering code in diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 49626d4051485..eac4056599511 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -99,6 +99,9 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const; static unsigned GetConvertOpcode(MVT DestTy, MVT SrcTy, LoadSDNode *N); + + NVPTX::Ordering insertMemoryInstructionFence(SDLoc DL, SDValue &Chain, + MemSDNode *N); }; class NVPTXDAGToDAGISelLegacy : public SelectionDAGISelLegacy { diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 91b239a52d17f..8f8ecac933b4d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -3930,7 +3930,6 @@ def : Pat<(atomic_fence (i64 6), (i64 1)), (atomic_thread_fence_acq_rel_sys)>, / def : Pat<(atomic_fence (i64 7), (i64 1)), (atomic_thread_fence_seq_cst_sys)>, // seq_cst(7) sys(1) Requires<[hasPTX<60>, hasSM<70>]>; - // If PTX<60 or SM<70, we fall back to MEMBAR: def : Pat<(atomic_fence (i64 4), (i64 1)), (INT_MEMBAR_SYS)>; // acquire(4) sys(1) def : Pat<(atomic_fence (i64 5), (i64 1)), (INT_MEMBAR_SYS)>; // release(5) sys(1) diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index c15ff6cae1f27..eebd91fefe4f0 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H #define LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H +#include "NVPTX.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" @@ -82,6 +83,36 @@ inline unsigned promoteScalarArgumentSize(unsigned size) { bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM); bool Isv2x16VT(EVT VT); + +namespace NVPTX { + +inline std::string OrderingToCString(Ordering Order) { + switch (Order) { + case Ordering::NotAtomic: + return "NotAtomic"; + case Ordering::Relaxed: + return "Relaxed"; + case Ordering::Acquire: + return "Acquire"; + case Ordering::Release: + return "Release"; + // case Ordering::AcquireRelease: return "AcquireRelease"; + case Ordering::SequentiallyConsistent: + return "SequentiallyConsistent"; + case Ordering::Volatile: + return "Volatile"; + case Ordering::RelaxedMMIO: + return "RelaxedMMIO"; + } + report_fatal_error("unknown ordering"); +} + +inline raw_ostream &operator<<(raw_ostream &O, Ordering Order) { + O << OrderingToCString(Order); + return O; } +} // namespace NVPTX +} // namespace llvm + #endif diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll index 68915b0f2698b..9cea33d12027f 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll @@ -1,169 +1,7 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s ; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %} -; CHECK-LABEL: generic_plain -define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { - ; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load i8, ptr %a - %a.add = add i8 %a.load, 1 - ; CHECK: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i8 %a.add, ptr %a - - ; CHECK: ld.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load i16, ptr %b - %b.add = add i16 %b.load, 1 - ; CHECK: 
st.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i16 %b.add, ptr %b - - ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load i32, ptr %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store i32 %c.add, ptr %c - - ; CHECK: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load i64, ptr %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store i64 %d.add, ptr %d - - ; CHECK: ld.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load float, ptr %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store float %e.add, ptr %c - - ; CHECK: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load double, ptr %c - %f.add = fadd double %f.load, 1. - ; CHECK: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store double %f.add, ptr %c - - ret void -} - -; CHECK-LABEL: generic_volatile -define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { - ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load volatile i8, ptr %a - %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i8 %a.add, ptr %a - - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load volatile i16, ptr %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i16 %b.add, ptr %b - - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load volatile i32, ptr %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile i32 %c.add, ptr %c - - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load volatile i64, ptr %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store volatile i64 %d.add, ptr %d - - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load volatile float, ptr %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store volatile float %e.add, ptr %c - - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load volatile double, ptr %c - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store volatile double %f.add, ptr %c - - ret void -} - -; CHECK-LABEL: generic_unordered -define void @generic_unordered(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr %a unordered, align 1 - %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr %a unordered, align 1 - - ; CHECK: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr %b unordered, align 2 - - ; CHECK: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr %c unordered, align 4 - - ; CHECK: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr %d unordered, align 8 - - ; CHECK: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr %e unordered, align 4 - - ; CHECK: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr %e unordered, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr %e unordered, align 8 - - ret void -} - -; CHECK-LABEL: generic_monotonic -define void @generic_monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr %a monotonic, align 1 - %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr %a monotonic, align 1 - - ; CHECK: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr %b monotonic, align 2 - - ; CHECK: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr %c monotonic, align 4 - - ; CHECK: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr %d monotonic, align 8 - - ; CHECK: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr %e monotonic, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr %e monotonic, align 4 - - ; CHECK: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr %e monotonic, align 8 - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr %e monotonic, align 8 - - ret void -} +;; generic statespace ; CHECK-LABEL: generic_acq_rel define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { @@ -206,335 +44,154 @@ define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnam ret void } -; CHECK-LABEL: generic_unordered_volatile -define void @generic_unordered_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr %a unordered, align 1 - %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr %a unordered, align 1 - - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr %b unordered, align 2 - - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr %c unordered, align 4 - - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr %d unordered, align 8 - - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr %e unordered, align 4 - - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr %e unordered, align 8 - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr %e unordered, align 8 - - ret void -} - -; CHECK-LABEL: generic_monotonic_volatile -define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr %a monotonic, align 1 - %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr %a monotonic, align 1 - - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr %b monotonic, align 2 - - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr %c monotonic, align 4 - - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr %d monotonic, align 8 - - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr %e monotonic, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr %e monotonic, align 4 - - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr %e monotonic, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr %e monotonic, align 8 - - ret void -} - -;; global statespace - -; CHECK-LABEL: global_plain -define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr { - ; CHECK: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load i8, ptr addrspace(1) %a - %a.add = add i8 %a.load, 1 - ; CHECK: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i8 %a.add, ptr addrspace(1) %a - - ; CHECK: ld.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load i16, ptr addrspace(1) %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i16 %b.add, ptr addrspace(1) %b - - ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load i32, ptr addrspace(1) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store i32 %c.add, ptr addrspace(1) %c - - ; CHECK: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load i64, ptr addrspace(1) %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store i64 %d.add, ptr addrspace(1) %d - - ; CHECK: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load float, ptr addrspace(1) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store float %e.add, ptr addrspace(1) %c - - ; CHECK: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load double, ptr addrspace(1) %c - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store double %f.add, ptr addrspace(1) %c - - ret void -} - -; CHECK-LABEL: global_volatile -define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr { - ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load volatile i8, ptr addrspace(1) %a - %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i8 %a.add, ptr addrspace(1) %a - - ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load volatile i16, ptr addrspace(1) %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i16 %b.add, ptr addrspace(1) %b - - ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load volatile i32, ptr addrspace(1) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile i32 %c.add, ptr addrspace(1) %c - - ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load volatile i64, ptr addrspace(1) %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store volatile i64 %d.add, ptr addrspace(1) %d - - ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load volatile float, ptr addrspace(1) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store volatile float %e.add, ptr addrspace(1) %c - - ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load volatile double, ptr addrspace(1) %c - %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store volatile double %f.add, ptr addrspace(1) %c - - ret void -} - -; CHECK-LABEL: global_unordered -define void @global_unordered(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1 +; CHECK-LABEL: generic_acq_rel_volatile +define void @generic_acq_rel_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(1) %a unordered, align 1 + ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a release, align 1 - ; CHECK: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(1) %b unordered, align 2 + ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(1) %b unordered, align 2 + ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b release, align 2 - ; CHECK: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(1) %c unordered, align 4 + ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c acquire, align 4 %c.add = 
add i32 %c.load, 1 - ; CHECK: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(1) %c unordered, align 4 + ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c release, align 4 - ; CHECK: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(1) %d unordered, align 8 + ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(1) %d unordered, align 8 + ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d release, align 8 - ; CHECK: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(1) %e unordered, align 4 + ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e acquire, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(1) %e unordered, align 4 + ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e release, align 4 - ; CHECK: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(1) %e unordered, align 8 + ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e acquire, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(1) %e unordered, align 8 + ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e release, align 8 ret void } -; CHECK-LABEL: global_monotonic -define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1 +; CHECK-LABEL: generic_sc +define void @generic_sc(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a seq_cst, align 1 - ; CHECK: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b seq_cst, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b seq_cst, align 2 - ; CHECK: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4 + ; CHECK: fence.sc.sys + ; CHECK: 
ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c seq_cst, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c seq_cst, align 4 - ; CHECK: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d seq_cst, align 8 - ; CHECK: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e seq_cst, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e seq_cst, align 4 - ; CHECK: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e seq_cst, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e seq_cst, align 8 ret void } -; CHECK-LABEL: global_unordered_volatile -define void @global_unordered_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1 +; CHECK-LABEL: generic_sc_volatile +define void @generic_sc_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(1) %a unordered, align 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a seq_cst, align 1 - ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(1) %b unordered, align 2 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b seq_cst, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(1) %b unordered, align 2 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b seq_cst, align 2 - ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(1) %c unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c seq_cst, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(1) %c unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c seq_cst, align 4 - ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(1) %d unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(1) %d unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d seq_cst, align 8 - ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(1) %e unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e seq_cst, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(1) %e 
unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e seq_cst, align 4 - ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(1) %e unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e seq_cst, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(1) %e unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e seq_cst, align 8 ret void } -; CHECK-LABEL: global_monotonic_volatile -define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1 - %a.add = add i8 %a.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1 - - ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2 - - ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4 - - ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8 - - ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4 - - ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8 - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8 - - ret void -} +;; global statespace ; CHECK-LABEL: global_acq_rel define void @global_acq_rel(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { @@ -618,253 +275,113 @@ define void @global_acq_rel_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, p ret void } -;; shared statespace - -; CHECK-LABEL: shared_plain -define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr { - ; CHECK: ld.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load i8, ptr addrspace(3) %a - %a.add = add i8 %a.load, 1 - ; CHECK: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i8 %a.add, ptr addrspace(3) %a - - ; CHECK: ld.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load i16, ptr addrspace(3) %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i16 %b.add, ptr addrspace(3) %b - - ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load i32, ptr addrspace(3) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store i32 %c.add, ptr addrspace(3) %c - - ; CHECK: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load i64, ptr addrspace(3) %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store i64 %d.add, ptr addrspace(3) %d - - ; CHECK: ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load float, ptr addrspace(3) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store float %e.add, ptr addrspace(3) %c - - ; CHECK: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load double, ptr addrspace(3) %c - %f.add = fadd double %f.load, 1. - ; CHECK: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store double %f.add, ptr addrspace(3) %c - - ret void -} - -; CHECK-LABEL: shared_volatile -define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load volatile i8, ptr addrspace(3) %a - %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i8 %a.add, ptr addrspace(3) %a - - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load volatile i16, ptr addrspace(3) %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i16 %b.add, ptr addrspace(3) %b - - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load volatile i32, ptr addrspace(3) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile i32 %c.add, ptr addrspace(3) %c - - ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load volatile i64, ptr addrspace(3) %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store volatile i64 %d.add, ptr addrspace(3) %d - - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load volatile float, ptr addrspace(3) %c - %e.add = fadd float %e.load, 1. 
- ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store volatile float %e.add, ptr addrspace(3) %c - - ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load volatile double, ptr addrspace(3) %c - %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store volatile double %f.add, ptr addrspace(3) %c - - ret void -} - -; CHECK-LABEL: shared_unordered -define void @shared_unordered(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1 - %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(3) %a unordered, align 1 - - ; CHECK: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(3) %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(3) %b unordered, align 2 - - ; CHECK: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(3) %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(3) %c unordered, align 4 - - ; CHECK: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(3) %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(3) %d unordered, align 8 - - ; CHECK: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(3) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(3) %e unordered, align 4 - - ; CHECK: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(3) %e unordered, align 8 - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(3) %e unordered, align 8 - - ret void -} - -; CHECK-LABEL: shared_unordered_volatile -define void @shared_unordered_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(3) %a unordered, align 1 +; CHECK-LABEL: global_seq_cst +define void @global_seq_cst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(3) %a unordered, align 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a seq_cst, align 1 - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(3) %b unordered, align 2 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b seq_cst, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(3) %b unordered, align 2 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b seq_cst, align 2 - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(3) %c unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c seq_cst, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(3) %c unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c seq_cst, align 4 - ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(3) %d unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(3) %d unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d seq_cst, align 8 - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(3) %e unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e seq_cst, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(3) %e unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: 
st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e seq_cst, align 4 - ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(3) %e unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e seq_cst, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(3) %e unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e seq_cst, align 8 ret void } -; CHECK-LABEL: shared_monotonic -define void @shared_monotonic(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1 +; CHECK-LABEL: global_seq_cst_volatile +define void @global_seq_cst_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a seq_cst, align 1 - ; CHECK: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b seq_cst, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b seq_cst, align 2 - ; CHECK: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c seq_cst, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c seq_cst, align 4 - ; CHECK: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8 + ; CHECK: fence.sc.sys + ; CHECK: 
st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d seq_cst, align 8 - ; CHECK: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e seq_cst, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e seq_cst, align 4 - ; CHECK: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e seq_cst, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e seq_cst, align 8 ret void } -; CHECK-LABEL: shared_monotonic_volatile -define void @shared_monotonic_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1 - %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1 - - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2 - - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4 - - ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8 - - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4 - - ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8 - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8 - - ret void -} +;; shared statespace ; CHECK-LABEL: shared_acq_rel define void @shared_acq_rel(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { @@ -948,332 +465,291 @@ define void @shared_acq_rel_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, p ret void } -;; local statespace - -; CHECK-LABEL: local_plain -define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load i8, ptr addrspace(5) %a - %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i8 %a.add, ptr addrspace(5) %a - - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load i16, ptr addrspace(5) %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i16 %b.add, ptr addrspace(5) %b - - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load i32, ptr addrspace(5) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store i32 %c.add, ptr addrspace(5) %c - - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load i64, ptr addrspace(5) %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store i64 %d.add, ptr addrspace(5) %d - - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load float, ptr addrspace(5) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store float %e.add, ptr addrspace(5) %c - - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load double, ptr addrspace(5) %c - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store double %f.add, ptr addrspace(5) %c - - ret void -} - -; CHECK-LABEL: local_volatile -define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load volatile i8, ptr addrspace(5) %a +; CHECK-LABEL: shared_seq_cst +define void @shared_seq_cst(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i8 %a.add, ptr addrspace(5) %a + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a seq_cst, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load volatile i16, ptr addrspace(5) %b + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b seq_cst, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i16 %b.add, ptr addrspace(5) %b + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b seq_cst, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load volatile i32, ptr addrspace(5) %c + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c seq_cst, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile i32 %c.add, ptr addrspace(5) %c + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c seq_cst, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load volatile i64, ptr addrspace(5) %d + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store volatile i64 %d.add, ptr addrspace(5) %d + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d seq_cst, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load volatile float, ptr addrspace(5) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store volatile float %e.add, ptr addrspace(5) %c + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e seq_cst, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e seq_cst, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load volatile double, ptr addrspace(5) %c - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store volatile double %f.add, ptr addrspace(5) %c + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e seq_cst, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e seq_cst, align 8 ret void } -; CHECK-LABEL: local_unordered -define void @local_unordered(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(5) %a unordered, align 1 +; CHECK-LABEL: shared_seq_cst_volatile +define void @shared_seq_cst_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(5) %a unordered, align 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a seq_cst, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(5) %b unordered, align 2 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b seq_cst, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(5) %b unordered, align 2 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b seq_cst, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(5) %c unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c seq_cst, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(5) %c unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c seq_cst, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(5) %d unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(5) %d unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d seq_cst, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(5) %e unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e seq_cst, 
align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(5) %e unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e seq_cst, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(5) %e unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e seq_cst, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(5) %e unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e seq_cst, align 8 ret void } -; CHECK-LABEL: local_unordered_volatile -define void @local_unordered_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(5) %a unordered, align 1 - %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(5) %a unordered, align 1 - - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(5) %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(5) %b unordered, align 2 - - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(5) %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(5) %c unordered, align 4 - - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(5) %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(5) %d unordered, align 8 - - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(5) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(5) %e unordered, align 4 - - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(5) %e unordered, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(5) %e unordered, align 8 +;; local statespace - ret void -} +; CHECK-LABEL: local_acq_rel +define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; TODO: generate PTX that preserves Concurrent Forward Progress + ; by using PTX atomic operations. 
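; Note on the CHECK lines below: PTX .local memory is private to a single thread, so
; acquire/release ordering has no other thread to synchronize with, and these atomics are
; currently expected to lower to plain ld.local/st.local. The TODO above tracks emitting
; PTX atomic operations instead, so that concurrent forward progress is preserved.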
-; CHECK-LABEL: local_monotonic -define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1 + %a.load = load atomic i8, ptr addrspace(5) %a acquire, align 1 %a.add = add i8 %a.load, 1 ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1 + store atomic i8 %a.add, ptr addrspace(5) %a release, align 1 ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2 + %b.load = load atomic i16, ptr addrspace(5) %b acquire, align 2 %b.add = add i16 %b.load, 1 ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2 + store atomic i16 %b.add, ptr addrspace(5) %b release, align 2 ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4 + %c.load = load atomic i32, ptr addrspace(5) %c acquire, align 4 %c.add = add i32 %c.load, 1 ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4 + store atomic i32 %c.add, ptr addrspace(5) %c release, align 4 ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8 + %d.load = load atomic i64, ptr addrspace(5) %d acquire, align 8 %d.add = add i64 %d.load, 1 ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8 + store atomic i64 %d.add, ptr addrspace(5) %d release, align 8 ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4 + %e.load = load atomic float, ptr addrspace(5) %e acquire, align 4 %e.add = fadd float %e.load, 1.0 ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4 + store atomic float %e.add, ptr addrspace(5) %e release, align 4 ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8 + %f.load = load atomic double, ptr addrspace(5) %e acquire, align 8 %f.add = fadd double %f.load, 1. ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8 + store atomic double %f.add, ptr addrspace(5) %e release, align 8 ret void } -; CHECK-LABEL: local_monotonic_volatile -define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { +; CHECK-LABEL: local_acq_rel_volatile +define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; TODO: generate PTX that preserves Concurrent Forward Progress + ; by using PTX atomic operations. 
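; Same expectation as local_acq_rel above: plain ld.local/st.local, with the volatile
; qualifier dropped as well, since .local accesses are not visible to other threads.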
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1 + %a.load = load atomic volatile i8, ptr addrspace(5) %a acquire, align 1 %a.add = add i8 %a.load, 1 ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1 + store atomic volatile i8 %a.add, ptr addrspace(5) %a release, align 1 ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2 + %b.load = load atomic volatile i16, ptr addrspace(5) %b acquire, align 2 %b.add = add i16 %b.load, 1 ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2 + store atomic volatile i16 %b.add, ptr addrspace(5) %b release, align 2 ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4 + %c.load = load atomic volatile i32, ptr addrspace(5) %c acquire, align 4 %c.add = add i32 %c.load, 1 ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4 + store atomic volatile i32 %c.add, ptr addrspace(5) %c release, align 4 ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8 + %d.load = load atomic volatile i64, ptr addrspace(5) %d acquire, align 8 %d.add = add i64 %d.load, 1 ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8 + store atomic volatile i64 %d.add, ptr addrspace(5) %d release, align 8 ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4 + %e.load = load atomic volatile float, ptr addrspace(5) %e acquire, align 4 %e.add = fadd float %e.load, 1.0 ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4 + store atomic volatile float %e.add, ptr addrspace(5) %e release, align 4 ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8 + %f.load = load atomic volatile double, ptr addrspace(5) %e acquire, align 8 %f.add = fadd double %f.load, 1. ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8 + store atomic volatile double %f.add, ptr addrspace(5) %e release, align 8 ret void } -; CHECK-LABEL: local_acq_rel -define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { +; CHECK-LABEL: local_seq_cst +define void @local_seq_cst(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; TODO: generate PTX that preserves Concurrent Forward Progress + ; by using PTX atomic operations. 
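; Unlike the generic, global, and shared seq_cst tests above, no fence.sc.sys is expected
; here: seq_cst accesses to the thread-private .local statespace currently lower to plain
; ld.local/st.local.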
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(5) %a acquire, align 1 + %a.load = load atomic i8, ptr addrspace(5) %a seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(5) %a release, align 1 + store atomic i8 %a.add, ptr addrspace(5) %a seq_cst, align 1 ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(5) %b acquire, align 2 + %b.load = load atomic i16, ptr addrspace(5) %b seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(5) %b release, align 2 + store atomic i16 %b.add, ptr addrspace(5) %b seq_cst, align 2 ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(5) %c acquire, align 4 + %c.load = load atomic i32, ptr addrspace(5) %c seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(5) %c release, align 4 + store atomic i32 %c.add, ptr addrspace(5) %c seq_cst, align 4 ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(5) %d acquire, align 8 + %d.load = load atomic i64, ptr addrspace(5) %d seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(5) %d release, align 8 + store atomic i64 %d.add, ptr addrspace(5) %d seq_cst, align 8 ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(5) %e acquire, align 4 + %e.load = load atomic float, ptr addrspace(5) %e seq_cst, align 4 %e.add = fadd float %e.load, 1.0 ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(5) %e release, align 4 + store atomic float %e.add, ptr addrspace(5) %e seq_cst, align 4 ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(5) %e acquire, align 8 + %f.load = load atomic double, ptr addrspace(5) %e seq_cst, align 8 %f.add = fadd double %f.load, 1. ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(5) %e release, align 8 + store atomic double %f.add, ptr addrspace(5) %e seq_cst, align 8 ret void } -; CHECK-LABEL: local_acq_rel_volatile -define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { +; CHECK-LABEL: local_seq_cst_volatile +define void @local_seq_cst_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; TODO: generate PTX that preserves Concurrent Forward Progress + ; by using PTX atomic operations. 
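; As with local_seq_cst and local_acq_rel_volatile above, the checks below expect plain
; ld.local/st.local: for the thread-private .local statespace both the fence.sc.sys and
; the volatile qualifier are omitted.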
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(5) %a acquire, align 1 + %a.load = load atomic volatile i8, ptr addrspace(5) %a seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(5) %a release, align 1 + store atomic volatile i8 %a.add, ptr addrspace(5) %a seq_cst, align 1 ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(5) %b acquire, align 2 + %b.load = load atomic volatile i16, ptr addrspace(5) %b seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(5) %b release, align 2 + store atomic volatile i16 %b.add, ptr addrspace(5) %b seq_cst, align 2 ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(5) %c acquire, align 4 + %c.load = load atomic volatile i32, ptr addrspace(5) %c seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(5) %c release, align 4 + store atomic volatile i32 %c.add, ptr addrspace(5) %c seq_cst, align 4 ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(5) %d acquire, align 8 + %d.load = load atomic volatile i64, ptr addrspace(5) %d seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(5) %d release, align 8 + store atomic volatile i64 %d.add, ptr addrspace(5) %d seq_cst, align 8 ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(5) %e acquire, align 4 + %e.load = load atomic volatile float, ptr addrspace(5) %e seq_cst, align 4 %e.add = fadd float %e.load, 1.0 ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(5) %e release, align 4 + store atomic volatile float %e.add, ptr addrspace(5) %e seq_cst, align 4 ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(5) %e acquire, align 8 + %f.load = load atomic volatile double, ptr addrspace(5) %e seq_cst, align 8 %f.add = fadd double %f.load, 1. ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(5) %e release, align 8 + store atomic volatile double %f.add, ptr addrspace(5) %e seq_cst, align 8 + + ; TODO: LLVM IR Verifier does not support atomics on vector types. ret void } + +; TODO: add plain,atomic,volatile,atomic volatile tests +; for .const and .param statespaces \ No newline at end of file diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll index 4c5e0920ce1ae..aac73f71a6766 100644 --- a/llvm/test/CodeGen/NVPTX/load-store.ll +++ b/llvm/test/CodeGen/NVPTX/load-store.ll @@ -1,5 +1,13 @@ -; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck -check-prefixes=CHECK,SM60 %s ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} +; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s -check-prefixes=CHECK,SM70 +; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %} + +; TODO: add i1, <8 x i8>, and <6 x i8> vector tests. 
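The new RUN lines above drive the same IR through two configurations. As a quick way to see what the SM60/SM70 prefixes distinguish, the sketch below is a hypothetical standalone reduction of the monotonic i32 case from generic_monotonic in this file (the @monotonic_i32 name is illustrative and not part of the patch): the sm_20 run checks relaxed atomics as ld.volatile/st.volatile, while the sm_70/PTX 8.2 run checks ld.relaxed.sys/st.relaxed.sys.

; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck -check-prefixes=CHECK,SM60 %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s -check-prefixes=CHECK,SM70

; CHECK-LABEL: monotonic_i32
define void @monotonic_i32(ptr %p) local_unnamed_addr {
  ; SM60: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
  ; SM70: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
  %v = load atomic i32, ptr %p monotonic, align 4
  %v.add = add i32 %v, 1
  ; SM60: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
  ; SM70: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
  store atomic i32 %v.add, ptr %p monotonic, align 4
  ret void
}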
+ +; TODO: add test for vectors that exceed 128-bit length +; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors +; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. ; generic statespace @@ -36,10 +44,76 @@ define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { store float %e.add, ptr %c ; CHECK: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load double, ptr %c + %f.load = load double, ptr %d %f.add = fadd double %f.load, 1. ; CHECK: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store double %f.add, ptr %c + store double %f.add, ptr %d + + ; TODO: make the lowering of this weak vector ops consistent with + ; the ones of the next tests. This test lowers to a weak PTX + ; vector op, but next test lowers to a vector PTX op. + ; CHECK: ld.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %h.load = load <2 x i8>, ptr %b + %h.add = add <2 x i8> %h.load, + ; CHECK: st.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} + store <2 x i8> %h.add, ptr %b + + ; TODO: make the lowering of this weak vector ops consistent with + ; the ones of the previous test. This test lowers to a weak + ; PTX scalar op, but prior test lowers to a vector PTX op. + ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %i.load = load <4 x i8>, ptr %c + %i.add = add <4 x i8> %i.load, + ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store <4 x i8> %i.add, ptr %c + + ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %j.load = load <2 x i16>, ptr %c + %j.add = add <2 x i16> %j.load, + ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store <2 x i16> %j.add, ptr %c + + ; CHECK: ld.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %k.load = load <4 x i16>, ptr %d + %k.add = add <4 x i16> %k.load, + ; CHECK: st.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} + store <4 x i16> %k.add, ptr %d + + ; CHECK: ld.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %l.load = load <2 x i32>, ptr %d + %l.add = add <2 x i32> %l.load, + ; CHECK: st.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} + store <2 x i32> %l.add, ptr %d + + ; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %m.load = load <4 x i32>, ptr %d + %m.add = add <4 x i32> %m.load, + ; CHECK: st.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} + store <4 x i32> %m.add, ptr %d + + ; CHECK: ld.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %n.load = load <2 x i64>, ptr %d + %n.add = add <2 x i64> %n.load, + ; CHECK: st.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} + store <2 x i64> %n.add, ptr %d + + ; CHECK: ld.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %o.load = load <2 x float>, ptr %d + %o.add = fadd <2 x float> %o.load, + ; CHECK: st.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} + store <2 x float> %o.add, ptr %d + + ; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %p.load = load <4 x float>, ptr %d + %p.add = fadd <4 x float> %p.load, + ; CHECK: st.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} + store <4 x float> %p.add, ptr %d + + ; CHECK: ld.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %q.load = load <2 x double>, ptr %d + %q.add = fadd <2 x double> %q.load, + ; CHECK: st.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} + store <2 x double> %q.add, ptr %d ret void } @@ -82,45 +156,136 @@ define void 
@generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store volatile double %f.add, ptr %c + ; TODO: volatile, atomic, and volatile atomic memory operations on vector types. + ; Currently, LLVM: + ; - does not allow atomic operations on vectors. + ; - it allows volatile operations but not clear what that means. + ; Following both semantics make sense in general and PTX supports both: + ; - volatile/atomic/volatile atomic applies to the whole vector + ; - volatile/atomic/volatile atomic applies elementwise + ; Actions required: + ; - clarify LLVM semantics for volatile on vectors and align the NVPTX backend with those + ; Below tests show that the current implementation picks the semantics in an inconsistent way + ; * volatile <2 x i8> lowers to "elementwise volatile" + ; * <4 x i8> lowers to "full vector volatile" + ; - provide support for vector atomics, e.g., by extending LLVM IR or via intrinsics + ; - update tests in load-store-sm70.ll as well. + + ; TODO: make this operation consistent with the one for <4 x i8> + ; This operation lowers to a "element wise volatile PTX operation". + ; CHECK: ld.volatile.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %h.load = load volatile <2 x i8>, ptr %b + %h.add = add <2 x i8> %h.load, + ; CHECK: st.volatile.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} + store volatile <2 x i8> %h.add, ptr %b + + ; TODO: make this operation consistent with the one for <2 x i8> + ; This operation lowers to a "full vector volatile PTX operation". + ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %i.load = load volatile <4 x i8>, ptr %c + %i.add = add <4 x i8> %i.load, + ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile <4 x i8> %i.add, ptr %c + + ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %j.load = load volatile <2 x i16>, ptr %c + %j.add = add <2 x i16> %j.load, + ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile <2 x i16> %j.add, ptr %c + + ; CHECK: ld.volatile.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %k.load = load volatile <4 x i16>, ptr %d + %k.add = add <4 x i16> %k.load, + ; CHECK: st.volatile.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} + store volatile <4 x i16> %k.add, ptr %d + + ; CHECK: ld.volatile.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %l.load = load volatile <2 x i32>, ptr %d + %l.add = add <2 x i32> %l.load, + ; CHECK: st.volatile.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} + store volatile <2 x i32> %l.add, ptr %d + + ; CHECK: ld.volatile.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %m.load = load volatile <4 x i32>, ptr %d + %m.add = add <4 x i32> %m.load, + ; CHECK: st.volatile.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} + store volatile <4 x i32> %m.add, ptr %d + + ; CHECK: ld.volatile.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %n.load = load volatile <2 x i64>, ptr %d + %n.add = add <2 x i64> %n.load, + ; CHECK: st.volatile.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} + store volatile <2 x i64> %n.add, ptr %d + + ; CHECK: ld.volatile.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %o.load = load volatile <2 x float>, ptr %d + %o.add = fadd <2 x float> %o.load, + ; CHECK: st.volatile.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} + store volatile <2 x float> %o.add, ptr %d + + 
; CHECK: ld.volatile.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %p.load = load volatile <4 x float>, ptr %d + %p.add = fadd <4 x float> %p.load, + ; CHECK: st.volatile.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} + store volatile <4 x float> %p.add, ptr %d + + ; CHECK: ld.volatile.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %q.load = load volatile <2 x double>, ptr %d + %q.add = fadd <2 x double> %q.load, + ; CHECK: st.volatile.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} + store volatile <2 x double> %q.add, ptr %d + ret void } ; CHECK-LABEL: generic_monotonic define void @generic_monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr %a monotonic, align 1 - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr %b monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr %b monotonic, align 2 - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr %c monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM60: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM70: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr %c monotonic, align 4 - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr %d monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM60: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM70: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr %d monotonic, align 8 - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr %e monotonic, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM60: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM70: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr %e monotonic, align 4 - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr %e monotonic, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM60: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr %e monotonic, align 8 ret void @@ -169,40 +334,52 @@ define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) ; CHECK-LABEL: generic_unordered define void @generic_unordered(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr %a unordered, align 1 - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr %b unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr %b unordered, align 2 - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr %c unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM60: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM70: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr %c unordered, align 4 - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr %d unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM60: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM70: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr %d unordered, align 8 - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr %e unordered, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM60: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM70: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr %e unordered, align 4 - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr %e unordered, align 8 %f.add = fadd double %f.load, 1. 
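; The SM60/SM70 expectations in this function are identical to generic_monotonic above:
; unordered atomics are lowered the same way as monotonic ones, to ld/st.volatile on the
; sm_60 path and to ld/st.relaxed.sys on sm_70 with PTX 8.2.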
- ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM60: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr %e unordered, align 8 ret void @@ -289,6 +466,66 @@ define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspac ; CHECK: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store double %f.add, ptr addrspace(1) %c + ; CHECK: ld.global.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %h.load = load <2 x i8>, ptr addrspace(1) %b + %h.add = add <2 x i8> %h.load, + ; CHECK: st.global.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} + store <2 x i8> %h.add, ptr addrspace(1) %b + + ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %i.load = load <4 x i8>, ptr addrspace(1) %c + %i.add = add <4 x i8> %i.load, + ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store <4 x i8> %i.add, ptr addrspace(1) %c + + ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %j.load = load <2 x i16>, ptr addrspace(1) %c + %j.add = add <2 x i16> %j.load, + ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store <2 x i16> %j.add, ptr addrspace(1) %c + + ; CHECK: ld.global.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %k.load = load <4 x i16>, ptr addrspace(1) %d + %k.add = add <4 x i16> %k.load, + ; CHECK: st.global.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} + store <4 x i16> %k.add, ptr addrspace(1) %d + + ; CHECK: ld.global.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %l.load = load <2 x i32>, ptr addrspace(1) %d + %l.add = add <2 x i32> %l.load, + ; CHECK: st.global.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} + store <2 x i32> %l.add, ptr addrspace(1) %d + + ; CHECK: ld.global.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %m.load = load <4 x i32>, ptr addrspace(1) %d + %m.add = add <4 x i32> %m.load, + ; CHECK: st.global.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} + store <4 x i32> %m.add, ptr addrspace(1) %d + + ; CHECK: ld.global.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %n.load = load <2 x i64>, ptr addrspace(1) %d + %n.add = add <2 x i64> %n.load, + ; CHECK: st.global.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} + store <2 x i64> %n.add, ptr addrspace(1) %d + + ; CHECK: ld.global.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %o.load = load <2 x float>, ptr addrspace(1) %d + %o.add = fadd <2 x float> %o.load, + ; CHECK: st.global.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} + store <2 x float> %o.add, ptr addrspace(1) %d + + ; CHECK: ld.global.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %p.load = load <4 x float>, ptr addrspace(1) %d + %p.add = fadd <4 x float> %p.load, + ; CHECK: st.global.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} + store <4 x float> %p.add, ptr addrspace(1) %d + + ; CHECK: ld.global.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %q.load = load <2 x double>, ptr addrspace(1) %d + %q.add = fadd <2 x double> %q.load, + ; CHECK: st.global.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} + store <2 x double> %q.add, ptr addrspace(1) %d + ret void } @@ -330,45 +567,117 @@ define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrs ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store volatile double %f.add, ptr 
addrspace(1) %c + ; CHECK: ld.volatile.global.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %h.load = load volatile <2 x i8>, ptr addrspace(1) %b + %h.add = add <2 x i8> %h.load, + ; CHECK: st.volatile.global.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} + store volatile<2 x i8> %h.add, ptr addrspace(1) %b + + ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %i.load = load volatile <4 x i8>, ptr addrspace(1) %c + %i.add = add <4 x i8> %i.load, + ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile<4 x i8> %i.add, ptr addrspace(1) %c + + ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %j.load = load volatile <2 x i16>, ptr addrspace(1) %c + %j.add = add <2 x i16> %j.load, + ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile<2 x i16> %j.add, ptr addrspace(1) %c + + ; CHECK: ld.volatile.global.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %k.load = load volatile <4 x i16>, ptr addrspace(1) %d + %k.add = add <4 x i16> %k.load, + ; CHECK: st.volatile.global.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} + store volatile<4 x i16> %k.add, ptr addrspace(1) %d + + ; CHECK: ld.volatile.global.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %l.load = load volatile <2 x i32>, ptr addrspace(1) %d + %l.add = add <2 x i32> %l.load, + ; CHECK: st.volatile.global.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} + store volatile<2 x i32> %l.add, ptr addrspace(1) %d + + ; CHECK: ld.volatile.global.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %m.load = load volatile <4 x i32>, ptr addrspace(1) %d + %m.add = add <4 x i32> %m.load, + ; CHECK: st.volatile.global.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} + store volatile<4 x i32> %m.add, ptr addrspace(1) %d + + ; CHECK: ld.volatile.global.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %n.load = load volatile <2 x i64>, ptr addrspace(1) %d + %n.add = add <2 x i64> %n.load, + ; CHECK: st.volatile.global.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} + store volatile<2 x i64> %n.add, ptr addrspace(1) %d + + ; CHECK: ld.volatile.global.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %o.load = load volatile <2 x float>, ptr addrspace(1) %d + %o.add = fadd <2 x float> %o.load, + ; CHECK: st.volatile.global.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} + store volatile<2 x float> %o.add, ptr addrspace(1) %d + + ; CHECK: ld.volatile.global.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %p.load = load volatile <4 x float>, ptr addrspace(1) %d + %p.add = fadd <4 x float> %p.load, + ; CHECK: st.volatile.global.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} + store volatile<4 x float> %p.add, ptr addrspace(1) %d + + ; CHECK: ld.volatile.global.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %q.load = load volatile <2 x double>, ptr addrspace(1) %d + %q.add = fadd <2 x double> %q.load, + ; CHECK: st.volatile.global.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} + store volatile<2 x double> %q.add, ptr addrspace(1) %d + ret void } ; CHECK-LABEL: global_monotonic define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: 
ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1 - ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2 - ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM70: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4 - ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM70: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8 - ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM70: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4 - ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8 ret void @@ -376,40 +685,52 @@ define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addr ; CHECK-LABEL: global_monotonic_volatile define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1 - ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2 - ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4 - ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8 - ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4 - ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: 
ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8 ret void @@ -417,40 +738,52 @@ define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ; CHECK-LABEL: global_unordered define void @global_unordered(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(1) %a unordered, align 1 - ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(1) %b unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(1) %b unordered, align 2 - ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(1) %c unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM70: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(1) %c unordered, align 4 - ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(1) %d unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM70: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(1) %d unordered, align 8 - ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(1) %e unordered, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM70: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(1) %e unordered, align 4 - 
; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(1) %e unordered, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(1) %e unordered, align 8 ret void @@ -458,40 +791,52 @@ define void @global_unordered(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addr ; CHECK-LABEL: global_unordered_volatile define void @global_unordered_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(1) %a unordered, align 1 - ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(1) %b unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(1) %b unordered, align 2 - ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(1) %c unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(1) %c unordered, align 4 - ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(1) %d unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(1) %d unordered, align 8 - ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(1) %e unordered, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; 
SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(1) %e unordered, align 4 - ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(1) %e unordered, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(1) %e unordered, align 8 ret void @@ -537,6 +882,66 @@ define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspac ; CHECK: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store double %f.add, ptr addrspace(3) %c + ; CHECK: ld.shared.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %h.load = load <2 x i8>, ptr addrspace(3) %b + %h.add = add <2 x i8> %h.load, + ; CHECK: st.shared.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} + store <2 x i8> %h.add, ptr addrspace(3) %b + + ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %i.load = load <4 x i8>, ptr addrspace(3) %c + %i.add = add <4 x i8> %i.load, + ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store <4 x i8> %i.add, ptr addrspace(3) %c + + ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %j.load = load <2 x i16>, ptr addrspace(3) %c + %j.add = add <2 x i16> %j.load, + ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store <2 x i16> %j.add, ptr addrspace(3) %c + + ; CHECK: ld.shared.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %k.load = load <4 x i16>, ptr addrspace(3) %d + %k.add = add <4 x i16> %k.load, + ; CHECK: st.shared.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} + store <4 x i16> %k.add, ptr addrspace(3) %d + + ; CHECK: ld.shared.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %l.load = load <2 x i32>, ptr addrspace(3) %d + %l.add = add <2 x i32> %l.load, + ; CHECK: st.shared.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} + store <2 x i32> %l.add, ptr addrspace(3) %d + + ; CHECK: ld.shared.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %m.load = load <4 x i32>, ptr addrspace(3) %d + %m.add = add <4 x i32> %m.load, + ; CHECK: st.shared.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} + store <4 x i32> %m.add, ptr addrspace(3) %d + + ; CHECK: ld.shared.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %n.load = load <2 x i64>, ptr addrspace(3) %d + %n.add = add <2 x i64> %n.load, + ; CHECK: st.shared.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} + store <2 x i64> %n.add, ptr addrspace(3) %d + + ; CHECK: ld.shared.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %o.load = load <2 x float>, ptr addrspace(3) %d + %o.add = fadd <2 x float> %o.load, + ; CHECK: st.shared.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} + store <2 x float> %o.add, ptr addrspace(3) %d + + ; CHECK: ld.shared.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %p.load = load <4 x float>, ptr addrspace(3) %d + %p.add = fadd <4 x float> %p.load, + ; CHECK: st.shared.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, 
%f{{[0-9]+}}} + store <4 x float> %p.add, ptr addrspace(3) %d + + ; CHECK: ld.shared.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %q.load = load <2 x double>, ptr addrspace(3) %d + %q.add = fadd <2 x double> %q.load, + ; CHECK: st.shared.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} + store <2 x double> %q.add, ptr addrspace(3) %d + ret void } @@ -578,45 +983,119 @@ define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrs ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store volatile double %f.add, ptr addrspace(3) %c + ; CHECK: ld.volatile.shared.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %h.load = load volatile <2 x i8>, ptr addrspace(3) %b + %h.add = add <2 x i8> %h.load, + ; CHECK: st.volatile.shared.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} + store volatile <2 x i8> %h.add, ptr addrspace(3) %b + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %i.load = load volatile <4 x i8>, ptr addrspace(3) %c + %i.add = add <4 x i8> %i.load, + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile <4 x i8> %i.add, ptr addrspace(3) %c + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %j.load = load volatile <2 x i16>, ptr addrspace(3) %c + %j.add = add <2 x i16> %j.load, + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile <2 x i16> %j.add, ptr addrspace(3) %c + + ; CHECK: ld.volatile.shared.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %k.load = load volatile <4 x i16>, ptr addrspace(3) %d + %k.add = add <4 x i16> %k.load, + ; CHECK: st.volatile.shared.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} + store volatile <4 x i16> %k.add, ptr addrspace(3) %d + + ; CHECK: ld.volatile.shared.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %l.load = load volatile <2 x i32>, ptr addrspace(3) %d + %l.add = add <2 x i32> %l.load, + ; CHECK: st.volatile.shared.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} + store volatile <2 x i32> %l.add, ptr addrspace(3) %d + + ; CHECK: ld.volatile.shared.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %m.load = load volatile <4 x i32>, ptr addrspace(3) %d + %m.add = add <4 x i32> %m.load, + ; CHECK: st.volatile.shared.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} + store volatile <4 x i32> %m.add, ptr addrspace(3) %d + + ; CHECK: ld.volatile.shared.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %n.load = load volatile <2 x i64>, ptr addrspace(3) %d + %n.add = add <2 x i64> %n.load, + ; CHECK: st.volatile.shared.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} + store volatile <2 x i64> %n.add, ptr addrspace(3) %d + + ; CHECK: ld.volatile.shared.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %o.load = load volatile <2 x float>, ptr addrspace(3) %d + %o.add = fadd <2 x float> %o.load, + ; CHECK: st.volatile.shared.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} + store volatile <2 x float> %o.add, ptr addrspace(3) %d + + ; CHECK: ld.volatile.shared.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %p.load = load volatile <4 x float>, ptr addrspace(3) %d + %p.add = fadd <4 x float> %p.load, + ; CHECK: st.volatile.shared.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} + store volatile <4 x float> %p.add, ptr addrspace(3) %d + + ; CHECK: 
ld.volatile.shared.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %q.load = load volatile <2 x double>, ptr addrspace(3) %d + %q.add = fadd <2 x double> %q.load, + ; CHECK: st.volatile.shared.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} + store volatile <2 x double> %q.add, ptr addrspace(3) %d + ret void } ; CHECK-LABEL: shared_monotonic define void @shared_monotonic(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; TODO: optimize .sys.shared to .cta.shared or .cluster.shared. + + ; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1 - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2 - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM60: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4 - ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM60: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8 - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM60: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4 - ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr 
addrspace(3) %e monotonic, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM60: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8 ret void @@ -665,40 +1144,54 @@ define void @shared_monotonic_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ; CHECK-LABEL: shared_unordered define void @shared_unordered(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; TODO: optimize .sys.shared to .cta.shared or .cluster.shared. + + ; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(3) %a unordered, align 1 - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(3) %b unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(3) %b unordered, align 2 - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(3) %c unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM60: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(3) %c unordered, align 4 - ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(3) %d unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM60: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(3) %d unordered, align 8 - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(3) %e unordered, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM60: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(3) %e unordered, align 4 - ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.f64 %fd{{[0-9]+}}, 
[%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(3) %e unordered, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM60: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(3) %e unordered, align 8 ret void @@ -785,11 +1278,74 @@ define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store double %f.add, ptr addrspace(5) %c + ; CHECK: ld.local.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %h.load = load <2 x i8>, ptr addrspace(5) %b + %h.add = add <2 x i8> %h.load, + ; CHECK: st.local.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} + store <2 x i8> %h.add, ptr addrspace(5) %b + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %i.load = load <4 x i8>, ptr addrspace(5) %c + %i.add = add <4 x i8> %i.load, + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store <4 x i8> %i.add, ptr addrspace(5) %c + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %j.load = load <2 x i16>, ptr addrspace(5) %c + %j.add = add <2 x i16> %j.load, + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store <2 x i16> %j.add, ptr addrspace(5) %c + + ; CHECK: ld.local.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %k.load = load <4 x i16>, ptr addrspace(5) %d + %k.add = add <4 x i16> %k.load, + ; CHECK: st.local.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} + store <4 x i16> %k.add, ptr addrspace(5) %d + + ; CHECK: ld.local.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %l.load = load <2 x i32>, ptr addrspace(5) %d + %l.add = add <2 x i32> %l.load, + ; CHECK: st.local.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} + store <2 x i32> %l.add, ptr addrspace(5) %d + + ; CHECK: ld.local.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %m.load = load <4 x i32>, ptr addrspace(5) %d + %m.add = add <4 x i32> %m.load, + ; CHECK: st.local.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} + store <4 x i32> %m.add, ptr addrspace(5) %d + + ; CHECK: ld.local.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %n.load = load <2 x i64>, ptr addrspace(5) %d + %n.add = add <2 x i64> %n.load, + ; CHECK: st.local.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} + store <2 x i64> %n.add, ptr addrspace(5) %d + + ; CHECK: ld.local.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %o.load = load <2 x float>, ptr addrspace(5) %d + %o.add = fadd <2 x float> %o.load, + ; CHECK: st.local.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} + store <2 x float> %o.add, ptr addrspace(5) %d + + ; CHECK: ld.local.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %p.load = load <4 x float>, ptr addrspace(5) %d + %p.add = fadd <4 x float> %p.load, + ; CHECK: st.local.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} + store <4 x float> %p.add, ptr addrspace(5) %d + + ; CHECK: ld.local.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %q.load = load <2 x double>, ptr addrspace(5) %d + %q.add = fadd <2 x double> %q.load, + ; CHECK: st.local.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} + store <2 x double> %q.add, ptr addrspace(5) %d + ret void } 
; CHECK-LABEL: local_volatile define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { + ; TODO: generate PTX that preserves Concurrent Forward Progress + ; by using volatile operations. + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load volatile i8, ptr addrspace(5) %a %a.add = add i8 %a.load, 1 @@ -826,11 +1382,74 @@ define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrsp ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store volatile double %f.add, ptr addrspace(5) %c + ; CHECK: ld.local.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %h.load = load volatile <2 x i8>, ptr addrspace(5) %b + %h.add = add <2 x i8> %h.load, + ; CHECK: st.local.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} + store volatile <2 x i8> %h.add, ptr addrspace(5) %b + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %i.load = load volatile <4 x i8>, ptr addrspace(5) %c + %i.add = add <4 x i8> %i.load, + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile <4 x i8> %i.add, ptr addrspace(5) %c + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %j.load = load volatile <2 x i16>, ptr addrspace(5) %c + %j.add = add <2 x i16> %j.load, + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile <2 x i16> %j.add, ptr addrspace(5) %c + + ; CHECK: ld.local.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %k.load = load volatile <4 x i16>, ptr addrspace(5) %d + %k.add = add <4 x i16> %k.load, + ; CHECK: st.local.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} + store volatile <4 x i16> %k.add, ptr addrspace(5) %d + + ; CHECK: ld.local.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %l.load = load volatile <2 x i32>, ptr addrspace(5) %d + %l.add = add <2 x i32> %l.load, + ; CHECK: st.local.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} + store volatile <2 x i32> %l.add, ptr addrspace(5) %d + + ; CHECK: ld.local.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %m.load = load volatile <4 x i32>, ptr addrspace(5) %d + %m.add = add <4 x i32> %m.load, + ; CHECK: st.local.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} + store volatile <4 x i32> %m.add, ptr addrspace(5) %d + + ; CHECK: ld.local.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %n.load = load volatile <2 x i64>, ptr addrspace(5) %d + %n.add = add <2 x i64> %n.load, + ; CHECK: st.local.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} + store volatile <2 x i64> %n.add, ptr addrspace(5) %d + + ; CHECK: ld.local.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %o.load = load volatile <2 x float>, ptr addrspace(5) %d + %o.add = fadd <2 x float> %o.load, + ; CHECK: st.local.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} + store volatile <2 x float> %o.add, ptr addrspace(5) %d + + ; CHECK: ld.local.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %p.load = load volatile <4 x float>, ptr addrspace(5) %d + %p.add = fadd <4 x float> %p.load, + ; CHECK: st.local.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} + store volatile <4 x float> %p.add, ptr addrspace(5) %d + + ; CHECK: ld.local.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %q.load = load volatile <2 x double>, ptr addrspace(5) %d + %q.add = fadd <2 x double> %q.load, + ; CHECK: 
st.local.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} + store volatile <2 x double> %q.add, ptr addrspace(5) %d + ret void } ; CHECK-LABEL: local_monotonic define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; TODO: generate PTX that preserves Concurrent Forward Progress + ; by using PTX atomic operations. + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1 %a.add = add i8 %a.load, 1 @@ -872,6 +1491,9 @@ define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrs ; CHECK-LABEL: local_monotonic_volatile define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; TODO: generate PTX that preserves Concurrent Forward Progress + ; by generating atomic or volatile operations + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1 %a.add = add i8 %a.load, 1 @@ -992,3 +1614,6 @@ define void @local_unordered_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ret void } + +; TODO: add plain,atomic,volatile,atomic volatile tests +; for .const and .param statespaces \ No newline at end of file
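
A minimal sketch of one possible .const-statespace case from the TODO above (a plain load from addrspace(4)), assuming the surrounding RUN lines and CHECK prefixes of this test file apply unchanged; the @const_plain name, the extra generic %out pointer, and the exact ld.const/st check lines are illustrative assumptions rather than part of the patch:

; CHECK-LABEL: const_plain
define void @const_plain(ptr addrspace(4) %a, ptr %out) local_unnamed_addr {
  ; .const is read-only, so the statespace is only exercised on the load side.
  ; CHECK: ld.const.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
  %a.load = load i32, ptr addrspace(4) %a
  %a.add = add i32 %a.load, 1
  ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
  store i32 %a.add, ptr %out
  ret void
}

Atomic, volatile, and atomic volatile variants would presumably follow the same SM60/SM70 check-prefix split used for the other statespaces above.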