[NVPTX] Add Volta Atomic SequentiallyConsistent Load and Store Operations #98551

gonzalobg · 2024-07-11T21:10:12Z

This PR Builds on #98022 .
It adds support for Volta's SequentiallyConsistent Load and Store operations at system scope.

llvmbot · 2024-07-11T21:10:40Z

@llvm/pr-subscribers-backend-nvptx

Author: None (gonzalobg)

Changes

This PR Builds on #98022 .
In this PR, the fence.sc.sys is inserted as part of the load/store instruction.
The next PR will refactor a few things, add fence instruction support, and split this out.

Patch is 119.81 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/98551.diff

7 Files Affected:

(modified) llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp (+40-2)
(modified) llvm/lib/Target/NVPTX/NVPTX.h (+47-1)
(modified) llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp (+268-76)
(modified) llvm/lib/Target/NVPTX/NVPTXInstrInfo.td (+72-72)
(modified) llvm/lib/Target/NVPTX/NVPTXSubtarget.h (+33-20)
(added) llvm/test/CodeGen/NVPTX/load-store-sm-70.ll (+1006)
(modified) llvm/test/CodeGen/NVPTX/load-store.ll (+577-6)

diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index b7a20c351f5ff..9f1e56cd84e51 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -224,9 +224,47 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
   if (Modifier) {
     const MCOperand &MO = MI->getOperand(OpNum);
     int Imm = (int) MO.getImm();
-    if (!strcmp(Modifier, "volatile")) {
-      if (Imm)
+    if (!strcmp(Modifier, "sem")) {
+      auto Ordering =
+          NVPTX::Ordering(static_cast<NVPTX::OrderingUnderlyingType>(Imm));
+      switch (Ordering) {
+      case NVPTX::Ordering::NotAtomic:
+        break;
+      case NVPTX::Ordering::Volatile:
         O << ".volatile";
+        break;
+      case NVPTX::Ordering::Relaxed:
+        O << ".relaxed.sys";
+        break;
+      case NVPTX::Ordering::Acquire:
+        O << ".acquire.sys";
+        break;
+      case NVPTX::Ordering::Release:
+        O << ".release.sys";
+        break;
+      case NVPTX::Ordering::RelaxedMMIO:
+        O << ".mmio.relaxed.sys";
+        break;
+      default:
+        SmallString<256> Msg;
+        raw_svector_ostream OS(Msg);
+        OS << "NVPTX LdStCode Printer does not support \"" << Ordering
+           << "\" sem modifier.";
+        report_fatal_error(OS.str());
+        break;
+      }
+    } else if (!strcmp(Modifier, "sc")) {
+      auto Ordering =
+          NVPTX::Ordering(static_cast<NVPTX::OrderingUnderlyingType>(Imm));
+      switch (Ordering) {
+        // TODO: refactor fence insertion in ISelDagToDag instead of here
+        // as part of implementing atomicrmw seq_cst.
+      case NVPTX::Ordering::SequentiallyConsistent:
+        O << "fence.sc.sys;\n\t";
+        break;
+      default:
+        break;
+      }
     } else if (!strcmp(Modifier, "addsp")) {
       switch (Imm) {
       case NVPTX::PTXLdStInstCode::GLOBAL:
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index b0cb24c63c3ce..c9cce23788ca4 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -106,6 +106,52 @@ enum LoadStore {
   isStoreShift = 6
 };
 
+// Extends LLVM AtomicOrdering with PTX Orderings:
+using OrderingUnderlyingType = unsigned int;
+enum class Ordering : OrderingUnderlyingType {
+  NotAtomic = 0, // PTX calls these: "Weak"
+  // Unordered = 1, // TODO: NVPTX should map this to "Relaxed"
+  Relaxed = 2,
+  // Consume = 3,   // Unimplemented in LLVM; NVPTX would map to "Acquire"
+  Acquire = 4,
+  Release = 5,
+  // AcquireRelease = 6, // TODO
+  SequentiallyConsistent = 7,
+  Volatile = 8,
+  RelaxedMMIO = 9,
+  LAST = RelaxedMMIO
+};
+
+template <typename OStream> OStream &operator<<(OStream &O, Ordering Order) {
+  switch (Order) {
+  case Ordering::NotAtomic:
+    O << "NotAtomic";
+    return O;
+  case Ordering::Relaxed:
+    O << "Relaxed";
+    return O;
+  case Ordering::Acquire:
+    O << "Acquire";
+    return O;
+  case Ordering::Release:
+    O << "Release";
+    return O;
+  // case Ordering::AcquireRelease:
+  //   O << "AcquireRelease";
+  //   return O;
+  case Ordering::SequentiallyConsistent:
+    O << "SequentiallyConsistent";
+    return O;
+  case Ordering::Volatile:
+    O << "Volatile";
+    return O;
+  case Ordering::RelaxedMMIO:
+    O << "RelaxedMMIO";
+    return O;
+  }
+  report_fatal_error("unknown ordering");
+}
+
 namespace PTXLdStInstCode {
 enum AddressSpace {
   GENERIC = 0,
@@ -126,7 +172,7 @@ enum VecType {
   V2 = 2,
   V4 = 4
 };
-}
+} // namespace PTXLdStInstCode
 
 /// PTXCvtMode - Conversion code enumeration
 namespace PTXCvtMode {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 1e1cbb15e33d4..41a3c2102427f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -700,6 +700,183 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
   return NVPTX::PTXLdStInstCode::GENERIC;
 }
 
+struct OperationOrderings {
+  NVPTX::OrderingUnderlyingType InstrOrdering;
+  NVPTX::OrderingUnderlyingType FenceOrdering;
+  OperationOrderings(NVPTX::Ordering o = NVPTX::Ordering::NotAtomic,
+                     NVPTX::Ordering f = NVPTX::Ordering::NotAtomic)
+      : InstrOrdering(static_cast<NVPTX::OrderingUnderlyingType>(o)),
+        FenceOrdering(static_cast<NVPTX::OrderingUnderlyingType>(f)) {}
+};
+
+static OperationOrderings
+getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
+  AtomicOrdering Ordering = N->getSuccessOrdering();
+  auto CodeAddrSpace = getCodeAddrSpace(N);
+
+  bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
+  bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();
+
+  // clang-format off
+
+  // Lowering for Load/Store Operations (note: AcquireRelease Loads or Stores error).
+  //
+  // | Atomic  | Volatile | Statespace                    | Lowering sm_60- | Lowering sm_70+                                      |
+  // |---------|----------|-------------------------------|-----------------|------------------------------------------------------|
+  // | No      | No       | All                           | plain           | .weak                                                |
+  // | No      | Yes      | Generic / Shared / Global [0] | .volatile       | .volatile                                            |
+  // | No      | Yes      | Local / Const / Param         | plain [1]       | .weak [1]                                            |
+  // | Relaxed | No       | Generic / Shared / Global [0] | .volatile       | <atomic sem>                                         |
+  // | Other   | No       | Generic / Shared / Global [0] | Error [2]       | <atomic sem>                                         |
+  // | Yes     | No       | Local / Const / Param         | plain [1]       | .weak [1]                                            |
+  // | Relaxed | Yes      | Generic / Shared [0]          | .volatile       | .volatile                                            |
+  // | Relaxed | Yes      | Global [0]                    | .volatile       | .mmio.relaxed.sys (PTX 8.2+) or .volatile (PTX 8.1-) |
+  // | Relaxed | Yes      | Local / Const / Param         | plain [1]       | .weak [1]                                            |
+  // | Other   | Yes      | Generic / Shared / Global [0] | Error [2]       | <atomic sem> [3]                                     |
+
+  // Lowering of CUDA C++ SequentiallyConsistent Operations and Fences to PTX
+  // by following the ABI proven sound in:
+  //   Lustig et al, A Formal Analysis of the NVIDIA PTX Memory Consistency Model, ASPLOS’19.
+  //   https://dl.acm.org/doi/pdf/10.1145/3297858.3304043
+  //
+  // | CUDA C++ Atomic Operation or Atomic Fence                                   | PTX Atomic Operation or Fence           |
+  // |-----------------------------------------------------------------------------|-----------------------------------------|
+  // | cuda::atomic_thread_fence(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>;                       |
+  // | cuda::atomic_load(memory_order_seq_cst, cuda::thread_scope_<scope>)         | fence.sc.<scope>; ld.acquire.<scope>;   |
+  // | cuda::atomic_store(memory_order_seq_cst, cuda::thread_scope_<scope>)        | fence.sc.<scope>; st.release.<scope>;   |  
+  // | cuda::atomic_fetch_<op>(memory_order_seq_cst, cuda::thread_scope_<scope>)   | fence.sc.<scope>; atom.acq_rel.<scope>; |
+
+  // clang-format on
+
+  // [0]: volatile and atomics are only supported on generic addressing to
+  //      shared or global, or shared, or global.
+  //      MMIO requires generic addressing to global or global, but
+  // (TODO) we only implement it for global.
+
+  // [1]: TODO: this implementation exhibits PTX Undefined Behavior; it
+  //      fails to preserve the side-effects of atomics and volatile
+  //      accesses in LLVM IR to local / const / param, causing
+  //      well-formed LLVM-IR & CUDA C++ programs to be miscompiled
+  //      in sm_70+.
+
+  if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL ||
+      CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT ||
+      CodeAddrSpace == NVPTX::PTXLdStInstCode::PARAM) {
+    return NVPTX::Ordering::NotAtomic;
+  }
+
+  // [2]: Atomics with Ordering different than Relaxed are not supported on
+  //      sm_60 and older; this includes volatile atomics.
+  if (!(Ordering == AtomicOrdering::NotAtomic ||
+        Ordering == AtomicOrdering::Monotonic) &&
+      !HasMemoryOrdering) {
+    SmallString<256> Msg;
+    raw_svector_ostream OS(Msg);
+    OS << "PTX does not support \"atomic\" for orderings different than"
+          "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order is: \""
+       << toIRString(Ordering) << "\".";
+    report_fatal_error(OS.str());
+  }
+
+  // [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop
+  // the volatile semantics and preserve the atomic ones.
+
+  // PTX volatile and PTX atomics are not available for statespace that differ
+  // from .generic, .global, or .shared. The behavior of PTX volatile and PTX
+  // atomics is undefined if the generic address does not refer to a .global or
+  // .shared memory location.
+  bool AddrGenericOrGlobalOrShared =
+      (CodeAddrSpace == NVPTX::PTXLdStInstCode::GENERIC ||
+       CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL ||
+       CodeAddrSpace == NVPTX::PTXLdStInstCode::SHARED);
+  bool UseRelaxedMMIO =
+      HasRelaxedMMIO && CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL;
+
+  switch (Ordering) {
+  case AtomicOrdering::NotAtomic:
+    return N->isVolatile() && AddrGenericOrGlobalOrShared
+               ? NVPTX::Ordering::Volatile
+               : NVPTX::Ordering::NotAtomic;
+  case AtomicOrdering::Monotonic:
+    if (N->isVolatile())
+      return UseRelaxedMMIO                ? NVPTX::Ordering::RelaxedMMIO
+             : AddrGenericOrGlobalOrShared ? NVPTX::Ordering::Volatile
+                                           : NVPTX::Ordering::NotAtomic;
+    else
+      return HasMemoryOrdering             ? NVPTX::Ordering::Relaxed
+             : AddrGenericOrGlobalOrShared ? NVPTX::Ordering::Volatile
+                                           : NVPTX::Ordering::NotAtomic;
+  // case AtomicOrdering::Consume: // If LLVM ever provides this, lower it to
+  // Acquire.
+  case AtomicOrdering::Acquire:
+    if (!N->readMem()) {
+      SmallString<256> Msg;
+      raw_svector_ostream OS(Msg);
+      OS << "PTX only supports Acquire Ordering on reads: "
+         << N->getOperationName();
+      N->print(OS);
+      report_fatal_error(OS.str());
+    }
+    return AddrGenericOrGlobalOrShared ? NVPTX::Ordering::Acquire
+                                       : NVPTX::Ordering::NotAtomic;
+  case AtomicOrdering::Release:
+    if (!N->writeMem()) {
+      SmallString<256> Msg;
+      raw_svector_ostream OS(Msg);
+      OS << "PTX only supports Release Ordering on writes: "
+         << N->getOperationName();
+      N->print(OS);
+      report_fatal_error(OS.str());
+    }
+    return AddrGenericOrGlobalOrShared ? NVPTX::Ordering::Release
+                                       : NVPTX::Ordering::NotAtomic;
+  case AtomicOrdering::AcquireRelease: {
+    SmallString<256> Msg;
+    raw_svector_ostream OS(Msg);
+    OS << "NVPTX does not support AcquireRelease Ordering on read-modify-write "
+          "yet and PTX does not support it on loads or stores: "
+       << N->getOperationName();
+    N->print(OS);
+    report_fatal_error(OS.str());
+  }
+  case AtomicOrdering::SequentiallyConsistent: {
+    // LLVM-IR SequentiallyConsistent atomics map to a two-instruction PTX
+    // sequence including a "fence.sc.sco" and the memory instruction with an
+    // Ordering that differs from "sc": acq, rel, or acq_rel, depending on
+    // whether the memory operation is a read, write, or read-modify-write.
+    //
+    // This sets the ordering of the fence to SequentiallyConsistent, and
+    // sets the corresponding ordering for the instruction.
+    NVPTX::Ordering InstrOrder;
+    if (N->readMem()) {
+      InstrOrder = NVPTX::Ordering::Acquire;
+    } else if (N->writeMem()) {
+      InstrOrder = NVPTX::Ordering::Release;
+    } else {
+      SmallString<256> Msg;
+      raw_svector_ostream OS(Msg);
+      OS << "NVPTX does not support SequentiallyConsistent Ordering on "
+            "read-modify-writes yet: "
+         << N->getOperationName();
+      N->print(OS);
+      report_fatal_error(OS.str());
+    }
+    return AddrGenericOrGlobalOrShared
+               ? OperationOrderings(InstrOrder,
+                                    NVPTX::Ordering::SequentiallyConsistent)
+               : OperationOrderings(NVPTX::Ordering::NotAtomic);
+  }
+  case AtomicOrdering::Unordered:
+    break;
+  }
+
+  SmallString<256> Msg;
+  raw_svector_ostream OS(Msg);
+  OS << "NVPTX backend does not support AtomicOrdering \""
+     << toIRString(Ordering) << "\" yet.";
+  report_fatal_error(OS.str());
+}
+
 static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
                           unsigned CodeAddrSpace, MachineFunction *F) {
   // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
@@ -902,32 +1079,19 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   if (!LoadedVT.isSimple())
     return false;
 
-  AtomicOrdering Ordering = LD->getSuccessOrdering();
-  // In order to lower atomic loads with stronger guarantees we would need to
-  // use load.acquire or insert fences. However these features were only added
-  // with PTX ISA 6.0 / sm_70.
-  // TODO: Check if we can actually use the new instructions and implement them.
-  if (isStrongerThanMonotonic(Ordering))
-    return false;
-
   // Address Space Setting
   unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
   if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
     return tryLDGLDU(N);
   }
 
+  // Memory Semantic Setting
+  auto [InstructionOrdering, FenceOrdering] =
+      getOperationOrderings(LD, Subtarget);
+
   unsigned int PointerSize =
       CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
 
-  // Volatile Setting
-  // - .volatile is only available for .global and .shared
-  // - .volatile has the same memory synchronization semantics as .relaxed.sys
-  bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
-  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
-      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
-      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
-    isVolatile = false;
-
   // Type Setting: fromType + fromTypeWidth
   //
   // Sign   : ISD::SEXTLOAD
@@ -968,9 +1132,14 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
                              NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
     if (!Opcode)
       return false;
-    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
-                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
-                      getI32Imm(fromTypeWidth, dl), Addr, Chain };
+    SDValue Ops[] = {getI32Imm(FenceOrdering, dl),
+                     getI32Imm(InstructionOrdering, dl),
+                     getI32Imm(CodeAddrSpace, dl),
+                     getI32Imm(vecType, dl),
+                     getI32Imm(fromType, dl),
+                     getI32Imm(fromTypeWidth, dl),
+                     Addr,
+                     Chain};
     NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
   } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
                                : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
@@ -979,9 +1148,15 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
                              NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
     if (!Opcode)
       return false;
-    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
-                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
-                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
+    SDValue Ops[] = {getI32Imm(FenceOrdering, dl),
+                     getI32Imm(InstructionOrdering, dl),
+                     getI32Imm(CodeAddrSpace, dl),
+                     getI32Imm(vecType, dl),
+                     getI32Imm(fromType, dl),
+                     getI32Imm(fromTypeWidth, dl),
+                     Base,
+                     Offset,
+                     Chain};
     NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
   } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
                                : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
@@ -996,9 +1171,15 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
                                NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
     if (!Opcode)
       return false;
-    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
-                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
-                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
+    SDValue Ops[] = {getI32Imm(FenceOrdering, dl),
+                     getI32Imm(InstructionOrdering, dl),
+                     getI32Imm(CodeAddrSpace, dl),
+                     getI32Imm(vecType, dl),
+                     getI32Imm(fromType, dl),
+                     getI32Imm(fromTypeWidth, dl),
+                     Base,
+                     Offset,
+                     Chain};
     NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
   } else {
     if (PointerSize == 64)
@@ -1012,9 +1193,14 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
                                NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
     if (!Opcode)
       return false;
-    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
-                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
-                      getI32Imm(fromTypeWidth, dl), N1, Chain };
+    SDValue Ops[] = {getI32Imm(FenceOrdering, dl),
+                     getI32Imm(InstructionOrdering, dl),
+                     getI32Imm(CodeAddrSpace, dl),
+                     getI32Imm(vecType, dl),
+                     getI32Imm(fromType, dl),
+                     getI32Imm(fromTypeWidth, dl),
+                     N1,
+                     Chain};
     NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
   }
 
@@ -1051,13 +1237,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
   unsigned int PointerSize =
       CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
 
-  // Volatile Setting
-  // - .volatile is only availalble for .global and .shared
-  bool IsVolatile = MemSD->isVolatile();
-  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
-      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
-      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
-    IsVolatile = false;
+  // Memory Semantic Setting
+  auto [InstructionOrdering, FenceOrdering] =
+      getOperationOrderings(MemSD, Subtarget);
 
   // Vector Setting
   MVT SimpleVT = LoadedVT.getSimpleVT();
@@ -1124,9 +1306,14 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
     }
     if (!Opcode)
       return false;
-    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
-                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
-                      getI32Imm(FromTypeWidth, DL), Addr, Chain };
+    SDValue Ops[] = {getI32Imm(FenceOrdering, DL),
+                     getI32Imm(InstructionOrdering, DL),
+                     getI32Imm(CodeAddrSpace, DL),
+                     getI32Imm(VecType, DL),
+                     getI32Imm(FromType, DL),
+                     getI32Imm(FromTypeWidth, DL),
+                     Addr,
+                     Chain};
     LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
   } else if (PointerSize == 64
                  ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
@@ -1149,9 +1336,15 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
     }
     if (!Opcode)
       return false;
-    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
-                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),...
[truncated]

gonzalobg · 2024-07-11T21:22:59Z

@AlexMaclean @Artem-B @jlebar hello all three, I'm looking for some reviewers for this series of PRs to improve the support of atomics in LLVM NVPTX backend. It's my first time touching these parts of the LLVM project, but everyone in the LLVM Discord has been super nice helping me get onboarded.

This PR includes all the changes of #96436 and #98022, which haven't really gotten any replies yet.. I just kept building on them. Not sure whether I should close them and continue building on them, or whether it is easier to review them piece meal.

I'm happy to help review other people's PR and help with the NVPTX backend in general. I'm "gonzalob" in the LLVM Discord in case anyone wants to chat :)

Artem-B · 2024-07-11T22:36:09Z

Sorry, I was out of office for a while. I'm catching up on the pending reviews now. Stay tuned.

gonzalobg · 2024-07-19T14:27:33Z

Have rebased this on top of #99649 . This is now a simpler change on top of that one.

gonzalobg · 2024-07-25T21:37:10Z

Have rebased on top of the merged PRs, and extended the unordered tests in the same way this PR was extending the other tests.

Artem-B

Looks OK overall, with some minor comments.

I'll be out till Tuesday, and will not be able to do the next review round until later next week.

llvm/lib/Target/NVPTX/NVPTX.h

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

llvm/test/CodeGen/NVPTX/load-store-sm-70.ll

llvm/test/CodeGen/NVPTX/load-store.ll

- Remove dead code in InstPrinter - Capitalization and improve Table comments - Update fence->membar lowering

… collisions

llvm/lib/Target/NVPTX/NVPTX.h

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Artem-B · 2024-07-31T17:37:55Z

llvm/lib/Target/NVPTX/NVPTX.h

-  Release = 4,
-  RelaxedMMIO = 5
+  // Consume = 3,   // Unimplemented in LLVM; NVPTX would map to "Acquire"
+  Acquire = 4,


Do we need specific enum values? If not, drop them.

I don't think we really really need them, but in the DAG they are just (i32 x) nodes where x is one of these values, and the values picked here match the ones of LLVM AtomicOrdering for the cases in which both expose the same ordering, which makes it easy to interpret those DAG nodes.

OK. The only problem is that hardcoded numbers may eventually deviate from the values used by AtomicOrdering.
Perhaps instead of the literal values, we should use AtomicOrdering enum values, possibly with a set of static asserts ensuring that the values in this enum are strictly ordered, if we rely on that.

Good idea with the static_asserts, that would prevent this from breaking.
Since we have some orderings that extend AtomicOrdering and we have to set LAST, etc., I've left the integer values though. If a static_assert fail, we'll need to update this properly.

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

llvm/test/CodeGen/NVPTX/load-store.ll

Artem-B

Few more minor nits, and I think we should be good to go.

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

llvm/lib/Target/NVPTX/NVPTXUtilities.h

Artem-B · 2024-07-31T22:59:06Z

llvm/lib/Target/NVPTX/NVPTX.h

-  Release = 4,
-  RelaxedMMIO = 5
+  // Consume = 3,   // Unimplemented in LLVM; NVPTX would map to "Acquire"
+  Acquire = 4,


OK. The only problem is that hardcoded numbers may eventually deviate from the values used by AtomicOrdering.
Perhaps instead of the literal values, we should use AtomicOrdering enum values, possibly with a set of static asserts ensuring that the values in this enum are strictly ordered, if we rely on that.

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

github-actions · 2024-08-01T20:20:58Z

✅ With the latest revision this PR passed the C/C++ code formatter.

gonzalobg · 2024-08-02T08:26:47Z

@Artem-B This is green!

Artem-B

LGTM.
Let's wait till Monday to land it.

llvm/lib/Target/NVPTX/NVPTX.h

llvm/lib/Target/NVPTX/NVPTXUtilities.h

mikaelholmen · 2024-08-13T07:41:20Z

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

+    Ops.append({BasePtr, Chain});
  }

+  SDNode *NVPTXST = NVPTXST =


One assignment too much here?
gcc detects this and warns like

../lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp:1977:29: warning: operation on 'NVPTXST' may be undefined [-Wsequence-point] 1977 | SDNode *NVPTXST = NVPTXST = | ~~~~~~~~^ 1978 | CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Indeed, thank you for reporting this.
Fix here: #103045

@mikaelholmen

Solves typo found by @mikaelholmen here: #98551 (comment)

llvmbot added the backend:NVPTX label Jul 11, 2024

Artem-B self-requested a review July 11, 2024 21:51

gonzalobg force-pushed the nvptx_volta_seq_cst branch 5 times, most recently from 78e3b5e to 2ecbb77 Compare July 15, 2024 22:12

gonzalobg mentioned this pull request Jul 19, 2024

[NVPTX] Support fence instruction #99649

Merged

gonzalobg force-pushed the nvptx_volta_seq_cst branch from 2ecbb77 to 25e94e7 Compare July 19, 2024 14:26

gonzalobg force-pushed the nvptx_volta_seq_cst branch 2 times, most recently from 696c71e to 3fd60bd Compare July 25, 2024 18:59

Artem-B reviewed Jul 25, 2024

View reviewed changes

gonzalobg force-pushed the nvptx_volta_seq_cst branch from 2574b35 to a3d9507 Compare July 30, 2024 14:01

gonzalobg added 12 commits July 31, 2024 00:35

[NVPTX] Support fence instruction

9b965ba

[NVPTX] Volta SequentiallyConsistent Load/Store Ops

6616144

[NVPTX] Align Memory Ordering enum with LLVM

c2711d1

[NVPTX]: Fix typos

8299d83

[NVPTX] Cleanup SeqCst Load/Store

8066f50

[NVPTX] Add vector load/store tests and refactor load/store tests

3f3fd6d

[NVPTX] Cleanups

6b6ef48

- Remove dead code in InstPrinter - Capitalization and improve Table comments - Update fence->membar lowering

[NVPTX] Update atomic volatile unordered test

41fb361

[NVPTX] Update comment: now handling Unordered

4cc2825

[NVPTX] refactor NVPTX::Ordering to string/stream

932a902

[NVPTX] Refactor OperationOrderings into anonymous namespace to avoid…

5dc3e00

… collisions

[NVPTX] Cleanup error reporting

e36d7c4

[NVPTX] Cleanup comments in tests

c9a5dd8

gonzalobg force-pushed the nvptx_volta_seq_cst branch from a3d9507 to c9a5dd8 Compare July 31, 2024 07:35

Artem-B reviewed Jul 31, 2024

View reviewed changes

gonzalobg added 6 commits July 31, 2024 13:03

[NVPTX] Refactor and clean up load,tryLoad,tryStore a bit

e727c76

[NVPTX] Simplify NVPTX::Ordering by not making it an enum class

7ec8da6

[NVPTX] Move Ordering to string functions to utilities

8b3e450

[NVPTX] Improve comments in load-store tests

2d8ab20

[NVPTX] Refactor fence insertion for loads/stores

3df9d66

[NVPTX] Refactor memory ordering

20be14b

Artem-B reviewed Aug 1, 2024

View reviewed changes

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp Show resolved Hide resolved

[NVPTX] Refactor and cleanups

e865fc3

[NVPTX] Switch to std::string

8738385

Artem-B approved these changes Aug 2, 2024

View reviewed changes

llvm/lib/Target/NVPTX/NVPTX.h Outdated Show resolved Hide resolved

llvm/lib/Target/NVPTX/NVPTXUtilities.h Outdated Show resolved Hide resolved

[NVPTX] Ordering uses AtomicOrdering values directly

ba80e0f

Artem-B approved these changes Aug 6, 2024

View reviewed changes

Artem-B merged commit f55abd5 into llvm:main Aug 6, 2024
7 checks passed

mikaelholmen reviewed Aug 13, 2024

View reviewed changes

gonzalobg mentioned this pull request Aug 13, 2024

[NVPTX] Fix typo causing GCC warning #103045

Merged

Artem-B pushed a commit that referenced this pull request Aug 14, 2024

[NVPTX] Fix typo causing GCC warning (#103045)

bdf4b9a

Solves typo found by @mikaelholmen here: #98551 (comment)

Artem-B mentioned this pull request Nov 26, 2024

Crash during nvptx codegen/instruction selection #117606

Closed

[NVPTX] Add Volta Atomic SequentiallyConsistent Load and Store Operations #98551

[NVPTX] Add Volta Atomic SequentiallyConsistent Load and Store Operations #98551

Uh oh!

Conversation

gonzalobg commented Jul 11, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Jul 11, 2024

Uh oh!

gonzalobg commented Jul 11, 2024

Uh oh!

Artem-B commented Jul 11, 2024

Uh oh!

gonzalobg commented Jul 19, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

gonzalobg commented Jul 25, 2024

Uh oh!

Artem-B left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Artem-B Jul 31, 2024

Choose a reason for hiding this comment

Uh oh!

gonzalobg Jul 31, 2024

Choose a reason for hiding this comment

Uh oh!

Artem-B Jul 31, 2024

Choose a reason for hiding this comment

Uh oh!

gonzalobg Aug 1, 2024

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Artem-B left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Artem-B Jul 31, 2024

Choose a reason for hiding this comment

Uh oh!

Uh oh!

github-actions bot commented Aug 1, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

gonzalobg commented Aug 2, 2024

Uh oh!

Artem-B left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

mikaelholmen Aug 13, 2024

Choose a reason for hiding this comment

Uh oh!

gonzalobg Aug 13, 2024

Choose a reason for hiding this comment

Uh oh!

Uh oh!

gonzalobg commented Jul 11, 2024 •

edited

Loading

gonzalobg commented Jul 19, 2024 •

edited

Loading

github-actions bot commented Aug 1, 2024 •

edited

Loading