Skip to content

[offload] Allow replay repetitions and report basic timing#193388

Open
kevinsala wants to merge 3 commits into llvm:main from
kevinsala:kernel-record-replay-additions
Open

[offload] Allow replay repetitions and report basic timing#193388
kevinsala wants to merge 3 commits into llvm:main from
kevinsala:kernel-record-replay-additions

Conversation

@kevinsala
Copy link
Copy Markdown
Contributor

@kevinsala kevinsala commented Apr 22, 2026

This PR extends the kernel replay tool to perform multiple replay repetitions on the same process. It also prints the execution time of the kernel replay, which includes the kernel launch and kernel synchronization (replay I/O time is excluded). Precise kernel timing should be obtained through the corresponding vendor tools.

The output report after recording has been improved as well.

@llvmbot
Copy link
Copy Markdown
Member

llvmbot commented Apr 22, 2026

@llvm/pr-subscribers-offload

Author: Kevin Sala Penades (kevinsala)

Changes

This PR extends the kernel replay tool to perform multiple repetitions of the replay. It also prints the execution time of the kernel replay. The timing includes the kernel launch and synchronization. Precise kernel timing should be obtained through the corresponding vendor tools.


Full diff: https://github.com/llvm/llvm-project/pull/193388.diff

8 Files Affected:

  • (modified) offload/include/Shared/APITypes.h (+6)
  • (modified) offload/include/omptarget.h (+5-4)
  • (modified) offload/libomptarget/interface.cpp (+13-8)
  • (modified) offload/libomptarget/omptarget.cpp (+10-2)
  • (modified) offload/libomptarget/private.h (+8-9)
  • (modified) offload/plugins-nextgen/common/include/RecordReplay.h (+25)
  • (modified) offload/plugins-nextgen/common/src/RecordReplay.cpp (+54-18)
  • (modified) offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp (+37-18)
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 40824596c3b9b..5e61bc7c842e7 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -134,6 +134,12 @@ struct KernelReplayOutcomeTy {
   /// The path to the file that stores the output memory snapshot after the
   /// kernel has been replayed.
   llvm::SmallString<128> OutputFilepath;
+  /// The execution time of the kernel replay in nanoseconds. This time includes
+  /// the kernel launch and synchronization time. Replay I/O is excluded.
+  uint64_t KernelReplayTimeNs = 0;
+  /// The pointer to the device memory allocation used to replay. This can be
+  /// reused for future replays of the same kernel.
+  void *ReplayDeviceAlloc = nullptr;
 };
 
 /// Extra kernel arguments managed by the runtime components. Notice these
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 0234e8fc55245..e5d9852ad48a6 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -428,10 +428,11 @@ void __tgt_target_nowait_query(void **AsyncHandle);
 /// device memory.
 int __tgt_target_kernel_replay(
     ident_t *Loc, int64_t DeviceId, void *HostPtr, void *DeviceMemory,
-    int64_t DeviceMemorySize, const llvm::offloading::EntryTy *Globals,
-    int32_t NumGlobals, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,
-    int32_t NumTeams, int32_t ThreadLimit, uint32_t SharedMemorySize,
-    uint64_t LoopTripCount, KernelReplayOutcomeTy *ReplayOutcome);
+    void *ReuseDeviceAlloc, int64_t DeviceMemorySize,
+    const llvm::offloading::EntryTy *Globals, int32_t NumGlobals,
+    void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams,
+    int32_t ThreadLimit, uint32_t SharedMemorySize, uint64_t LoopTripCount,
+    KernelReplayOutcomeTy *ReplayOutcome);
 
 void __tgt_set_info_flag(uint32_t);
 
diff --git a/offload/libomptarget/interface.cpp b/offload/libomptarget/interface.cpp
index 9dd206d140c18..1049cd72d0958 100644
--- a/offload/libomptarget/interface.cpp
+++ b/offload/libomptarget/interface.cpp
@@ -509,6 +509,9 @@ EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
 /// \param DeviceMemory A pointer to an array storing device memory data to move
 ///                     prior to kernel execution.
 /// \param DeviceMemorySize The size of the above device memory data in bytes.
+/// \param ReuseDeviceAlloc Pointer to a device memory allocation that should
+///                            be reused for the replay. If null, the replay
+///                            will allocate the necessary device buffer.
 /// \param TgtArgs An array of pointers of the pre-recorded target kernel
 ///                arguments.
 /// \param TgtOffsets An array of pointers of the pre-recorded target kernel
@@ -521,10 +524,11 @@ EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
 /// \return OMP_TGT_SUCCESS on success, OMP_TGT_FAIL on failure.
 EXTERN int __tgt_target_kernel_replay(
     ident_t *Loc, int64_t DeviceId, void *HostPtr, void *DeviceMemory,
-    int64_t DeviceMemorySize, const llvm::offloading::EntryTy *Globals,
-    int32_t NumGlobals, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,
-    int32_t NumTeams, int32_t ThreadLimit, uint32_t SharedMemorySize,
-    uint64_t LoopTripCount, KernelReplayOutcomeTy *ReplayOutcome) {
+    void *ReuseDeviceAlloc, int64_t DeviceMemorySize,
+    const llvm::offloading::EntryTy *Globals, int32_t NumGlobals,
+    void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams,
+    int32_t ThreadLimit, uint32_t SharedMemorySize, uint64_t LoopTripCount,
+    KernelReplayOutcomeTy *ReplayOutcome) {
   assert(PM && "Runtime not initialized");
   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
   if (checkDevice(DeviceId, Loc)) {
@@ -541,10 +545,11 @@ EXTERN int __tgt_target_kernel_replay(
                     /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
 
   AsyncInfoTy AsyncInfo(*DeviceOrErr);
-  int Rc = target_replay(
-      Loc, *DeviceOrErr, HostPtr, DeviceMemory, DeviceMemorySize, Globals,
-      NumGlobals, TgtArgs, TgtOffsets, NumArgs, NumTeams, ThreadLimit,
-      SharedMemorySize, LoopTripCount, AsyncInfo, ReplayOutcome);
+  int Rc =
+      target_replay(Loc, *DeviceOrErr, HostPtr, DeviceMemory, DeviceMemorySize,
+                    ReuseDeviceAlloc, Globals, NumGlobals, TgtArgs, TgtOffsets,
+                    NumArgs, NumTeams, ThreadLimit, SharedMemorySize,
+                    LoopTripCount, AsyncInfo, ReplayOutcome);
 
   if (Rc == OFFLOAD_SUCCESS)
     Rc = AsyncInfo.synchronize();
diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp
index f06654c639a8e..82a0ed73317d5 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -2391,6 +2391,7 @@ int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr,
 /// configuration.
 int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
                   void *DeviceMemory, int64_t DeviceMemorySize,
+                  void *ReuseDeviceAlloc,
                   const llvm::offloading::EntryTy *Globals, int32_t NumGlobals,
                   void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,
                   int32_t NumTeams, int32_t ThreadLimit,
@@ -2448,13 +2449,20 @@ int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
     }
   }
 
-  void *TgtPtr = Device.allocData(DeviceMemorySize, /*HstPtr=*/nullptr,
-                                  TARGET_ALLOC_DEFAULT);
+  // Reuse a previous device allocation or allocate a new device buffer.
+  void *&TgtPtr = ReuseDeviceAlloc;
+  if (!TgtPtr)
+    TgtPtr = Device.allocData(DeviceMemorySize, /*HstPtr=*/nullptr,
+                              TARGET_ALLOC_DEFAULT);
   if (!TgtPtr) {
     REPORT() << "Failed to allocate device memory.";
     return OFFLOAD_FAIL;
   }
 
+  // Save the device allocation for future replays of the same kernel.
+  if (ReplayOutcome)
+    ReplayOutcome->ReplayDeviceAlloc = TgtPtr;
+
   int Ret =
       Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo);
   if (Ret != OFFLOAD_SUCCESS) {
diff --git a/offload/libomptarget/private.h b/offload/libomptarget/private.h
index 31b295bda613e..e52028cc060d9 100644
--- a/offload/libomptarget/private.h
+++ b/offload/libomptarget/private.h
@@ -30,15 +30,14 @@ extern int target_activate_rr(DeviceTy &Device, uint64_t MemorySize,
                               void *ReqAddr, bool IsRecord, bool SaveOutput,
                               bool EmitReport, const char *OutputDirPath);
 
-extern int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
-                         void *DeviceMemory, int64_t DeviceMemorySize,
-                         const llvm::offloading::EntryTy *Globals,
-                         int32_t NumGlobals, void **TgtArgs,
-                         ptrdiff_t *TgtOffsets, int32_t NumArgs,
-                         int32_t NumTeams, int32_t ThreadLimit,
-                         uint32_t SharedMemorySize, uint64_t LoopTripCount,
-                         AsyncInfoTy &AsyncInfo,
-                         KernelReplayOutcomeTy *ReplayOutcome);
+extern int
+target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr, void *DeviceMemory,
+              int64_t DeviceMemorySize, void *ReuseDeviceAlloc,
+              const llvm::offloading::EntryTy *Globals, int32_t NumGlobals,
+              void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,
+              int32_t NumTeams, int32_t ThreadLimit, uint32_t SharedMemorySize,
+              uint64_t LoopTripCount, AsyncInfoTy &AsyncInfo,
+              KernelReplayOutcomeTy *ReplayOutcome);
 
 extern void handleTargetOutcome(bool Success, ident_t *Loc);
 
diff --git a/offload/plugins-nextgen/common/include/RecordReplay.h b/offload/plugins-nextgen/common/include/RecordReplay.h
index 0929a533effa4..65a861cc8a0cc 100644
--- a/offload/plugins-nextgen/common/include/RecordReplay.h
+++ b/offload/plugins-nextgen/common/include/RecordReplay.h
@@ -11,6 +11,7 @@
 #ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RECORDREPLAY_H
 #define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RECORDREPLAY_H
 
+#include <chrono>
 #include <cstddef>
 #include <cstdint>
 #include <filesystem>
@@ -116,6 +117,10 @@ struct RecordReplayTy {
     /// information about the the kernel's replay, such as the snapshot file.
     KernelReplayOutcomeTy *ReplayOutcome = nullptr;
 
+    /// The begin and end time points of the kernel execution.
+    using ClockTy = std::chrono::steady_clock;
+    mutable std::chrono::time_point<ClockTy> BeginTime, EndTime;
+
     /// The number of occurrences during the execution.
     mutable size_t Occurrences = 0;
 
@@ -129,6 +134,17 @@ struct RecordReplayTy {
               NumTeams == Other.NumTeams && NumThreads == Other.NumThreads &&
               SharedMemorySize == Other.SharedMemorySize);
     }
+
+    /// Record the begin and end time points of the kernel execution.
+    void recordBeginTime() const { BeginTime = ClockTy::now(); }
+    void recordEndTime() const { EndTime = ClockTy::now(); }
+
+    /// Get the kernel execution time in nanoseconds.
+    uint64_t getRecordedTimeNs() const {
+      using DurationNsTy = std::chrono::duration<uint64_t, std::nano>;
+      return std::chrono::duration_cast<DurationNsTy>(EndTime - BeginTime)
+          .count();
+    }
   };
 
   struct InstanceHasher {
@@ -210,6 +226,15 @@ struct RecordReplayTy {
                    uint32_t NumThreads, uint32_t SharedMemorySize,
                    KernelReplayOutcomeTy *ReplayOutcome);
 
+  /// Unregister an instance once it has been replayed. Instances during
+  /// recording cannot be unregistered. Accessing the instance beyond this point
+  /// is invalid.
+  Error unregisterInstance(const InstanceTy &Instance);
+
+  /// Populate the replay outcome struct to forward some replay information.
+  void populateReplayOutcome(const InstanceTy &Instance,
+                             KernelReplayOutcomeTy &Outcome);
+
   /// Record the prologue data.
   virtual Error
   recordPrologueImpl(const GenericKernelTy &Kernel, const InstanceTy &Instance,
diff --git a/offload/plugins-nextgen/common/src/RecordReplay.cpp b/offload/plugins-nextgen/common/src/RecordReplay.cpp
index 0b03e3405ce58..8c4b012fdc9e2 100644
--- a/offload/plugins-nextgen/common/src/RecordReplay.cpp
+++ b/offload/plugins-nextgen/common/src/RecordReplay.cpp
@@ -89,17 +89,20 @@ Error RecordReplayTy::deinit() {
 
 Error RecordReplayTy::emitInstanceReport() {
   std::lock_guard<std::mutex> LG(InstancesLock);
-  llvm::outs() << "=== record report begin ===\n";
-  llvm::outs() << "directory: "
+  llvm::outs() << "=== Kernel Record Report ===\n";
+  llvm::outs() << "Directory: "
                << std::filesystem::absolute(OutputDirectory).string() << "\n";
-  llvm::outs() << "kernels: " << Instances.size() << "\n";
+  llvm::outs() << "Total Instances: " << Instances.size() << "\n";
+  llvm::outs() << "JSON Filename, Kernel Name, Time (ns), Occurrences:\n";
 
   SmallString<128> Filename;
   for (const auto &Inst : Instances)
     llvm::outs()
         << getFilename(Inst, FileTy::Descriptor, /*IncludeDir=*/false).c_str()
-        << ": " << Inst.Kernel.getName() << "\n";
-  llvm::outs() << "=== record report end ===\n";
+        << ", " << Inst.Kernel.getName() << ", " << Inst.getRecordedTimeNs()
+        << ", " << Inst.Occurrences << "\n";
+  llvm::outs() << "=== End Kernel Record Report ===\n";
+
   return Plugin::success();
 }
 
@@ -116,6 +119,16 @@ RecordReplayTy::registerInstance(const GenericKernelTy &Kernel,
   return {*It, Inserted};
 }
 
+Error RecordReplayTy::unregisterInstance(const InstanceTy &Instance) {
+  assert(isReplaying() && "Cannot unregister instance when recording.");
+
+  std::lock_guard<std::mutex> LG(InstancesLock);
+  size_t Erased = Instances.erase(Instance);
+  if (Erased != 1)
+    return Plugin::error(ErrorCode::INVALID_ARGUMENT, "invalid instance");
+  return Plugin::success();
+}
+
 Expected<void *> RecordReplayTy::allocate(uint64_t Size) {
   assert(StartAddr && "Expected memory has been pre-allocated");
   constexpr int Alignment = 16;
@@ -147,34 +160,57 @@ Expected<RecordReplayTy::HandleTy> RecordReplayTy::recordPrologue(
       (KernelExtraArgs) ? KernelExtraArgs->ReplayOutcome : nullptr);
 
   HandleTy Handle{&Instance, First};
-  if (isReplaying() || !First)
+  if (!First)
     return Handle;
 
-  if (auto Err = recordDescImpl(Kernel, Instance, KernelArgs, LaunchParams))
-    return Err;
+  if (isRecording()) {
+    if (auto Err = recordDescImpl(Kernel, Instance, KernelArgs, LaunchParams))
+      return Err;
 
-  if (auto Err = recordPrologueImpl(Kernel, Instance, KernelArgs, LaunchParams))
-    return Err;
+    if (auto Err =
+            recordPrologueImpl(Kernel, Instance, KernelArgs, LaunchParams))
+      return Err;
+  }
+
+  // Start the timer for the kernel execution.
+  Instance.recordBeginTime();
 
   return Handle;
 }
 
 Error RecordReplayTy::recordEpilogue(const GenericKernelTy &Kernel,
                                      HandleTy Handle) {
-  if (!shouldRecordEpilogue() || !Handle.Active)
+  if (!Handle.Active)
     return Plugin::success();
 
+  // Stop the timer for the kernel execution.
   const InstanceTy &Instance = *Handle.Instance;
-  if (auto Err = recordEpilogueImpl(Kernel, Instance))
-    return Err;
+  Instance.recordEndTime();
+
+  if (shouldRecordEpilogue())
+    if (auto Err = recordEpilogueImpl(Kernel, Instance))
+      return Err;
+
+  if (isReplaying() && Instance.ReplayOutcome)
+    populateReplayOutcome(Instance, *Instance.ReplayOutcome);
 
-  // If necessary, inform the replaying tool about where the epilogue snapshot
-  // file has been stored.
-  if (isReplaying() && Instance.ReplayOutcome) {
+  // After a replay, unregister the instance so it can be replayed again. Do
+  // not access the instance object beyond this point.
+  if (isReplaying())
+    return unregisterInstance(Instance);
+
+  return Plugin::success();
+}
+
+void RecordReplayTy::populateReplayOutcome(const InstanceTy &Instance,
+                                           KernelReplayOutcomeTy &Outcome) {
+  // Only save the epilogue output filename if it was recorded.
+  if (shouldRecordEpilogue()) {
     SmallString<128> Filename = getFilename(Instance, FileTy::EpilogueSnapshot);
-    Instance.ReplayOutcome->OutputFilepath = Filename;
+    Outcome.OutputFilepath = Filename;
   }
-  return Plugin::success();
+  // Save the kernel replay time.
+  Outcome.KernelReplayTimeNs = Instance.getRecordedTimeNs();
 }
 
 Error NativeRecordReplayTy::recordPrologueImpl(
diff --git a/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp b/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
index 7e8ceb7c24c08..7526a81b450d8 100644
--- a/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
+++ b/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
@@ -58,6 +58,11 @@ static cl::opt<uint32_t> NumThreadsOpt("num-threads",
 static cl::opt<int32_t> DeviceIdOpt("device-id", cl::desc("Set the device id."),
                                     cl::init(-1), cl::cat(ReplayOptions));
 
+static cl::opt<uint32_t>
+    RepetitionsOpt("repetitions",
+                   cl::desc("Set the number of replay repetitions."),
+                   cl::init(1), cl::cat(ReplayOptions));
+
 template <typename... ArgsTy>
 Error createErr(const char *ErrFmt, ArgsTy &&...Args) {
   return llvm::createStringError(llvm::inconvertibleErrorCode(), ErrFmt,
@@ -132,7 +137,6 @@ Error verifyReplayOutput(StringRef RecordOutputFilename,
     return createErr("replay device memory failed to verify");
 
   // Sucessfully verified.
-  outs() << TOOL_PREFIX << "Replay device memory verified\n";
   return Error::success();
 }
 
@@ -315,26 +319,41 @@ Error replayKernel() {
   auto RecordInputBuffer = std::move(RecordInputBufferOrErr.get());
 
   KernelReplayOutcomeTy Outcome;
-  Rc = __tgt_target_kernel_replay(
-      /*Loc=*/nullptr, DeviceId, OffloadEntries[0].Address,
-      const_cast<char *>(RecordInputBuffer->getBufferStart()),
-      RecordInputBuffer->getBufferSize(),
-      NumGlobals ? &OffloadEntries[1] : nullptr, NumGlobals, TgtArgs.data(),
-      TgtArgOffsets.data(), NumArgs, NumTeams, NumThreads, SharedMemorySize,
-      LoopTripCount, &Outcome);
-  if (Rc != OMP_TGT_SUCCESS)
-    return createErr("failed to replay kernel");
 
-  // Verify the replay output if requested.
-  if (VerifyOpt) {
-    if (Outcome.OutputFilepath.empty())
-      return createErr("replay output file was not generated");
-
-    Filepath.replace_extension("record_output");
-    return verifyReplayOutput(Filepath.c_str(), Outcome.OutputFilepath.c_str());
+  // Perform the kernel replay and verification (if needed) for each repetition.
+  for (uint32_t R = 0; R < RepetitionsOpt; ++R) {
+    Rc = __tgt_target_kernel_replay(
+        /*Loc=*/nullptr, DeviceId, OffloadEntries[0].Address,
+        const_cast<char *>(RecordInputBuffer->getBufferStart()),
+        R > 0 ? Outcome.ReplayDeviceAlloc : nullptr,
+        RecordInputBuffer->getBufferSize(),
+        NumGlobals ? &OffloadEntries[1] : nullptr, NumGlobals, TgtArgs.data(),
+        TgtArgOffsets.data(), NumArgs, NumTeams, NumThreads, SharedMemorySize,
+        LoopTripCount, &Outcome);
+    if (Rc != OMP_TGT_SUCCESS)
+      return createErr("failed to replay kernel");
+
+    // Verify the replay output if requested.
+    if (VerifyOpt) {
+      if (Outcome.OutputFilepath.empty())
+        return createErr("replay output file was not generated");
+
+      Filepath.replace_extension("record_output");
+      if (auto Err = verifyReplayOutput(Filepath.c_str(),
+                                        Outcome.OutputFilepath.c_str()))
+        return Err;
+    }
+
+    outs() << TOOL_PREFIX << " Replay time (" << R
+           << "): " << Outcome.KernelReplayTimeNs << " ns\n";
   }
 
-  outs() << TOOL_PREFIX << "Replay finished (verification skipped)\n";
+  // At this point, any verification done was successful.
+  if (VerifyOpt)
+    outs() << TOOL_PREFIX << " Replay done, device memory verified\n";
+  else
+    outs() << TOOL_PREFIX << " Replay done, verification skipped\n";
+
   return Error::success();
 }
 

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants