[offload] Allow replay repetitions and report basic timing#193388
Open
[offload] Allow replay repetitions and report basic timing#193388
Conversation
Member
|
@llvm/pr-subscribers-offload Author: Kevin Sala Penades (kevinsala) Changes: This PR extends the kernel replay tool to perform multiple repetitions of the replay. It also prints the execution time of the kernel replay. The timing includes the kernel launch and synchronization. Precise kernel timing should be obtained through the corresponding vendor tools. Full diff: https://github.com/llvm/llvm-project/pull/193388.diff 8 Files Affected:
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 40824596c3b9b..5e61bc7c842e7 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -134,6 +134,12 @@ struct KernelReplayOutcomeTy {
/// The path to the file that stores the output memory snapshot after the
/// kernel has been replayed.
llvm::SmallString<128> OutputFilepath;
+ /// The execution time of the kernel replay in nanoseconds. This time includes
+ /// the kernel launch and synchronization time. Replay I/O is excluded.
+ uint64_t KernelReplayTimeNs = 0;
+ /// The pointer to the device memory allocation used to replay. This can be
+ /// reused for future replays of the same kernel.
+ void *ReplayDeviceAlloc = nullptr;
};
/// Extra kernel arguments managed by the runtime components. Notice these
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 0234e8fc55245..e5d9852ad48a6 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -428,10 +428,11 @@ void __tgt_target_nowait_query(void **AsyncHandle);
/// device memory.
int __tgt_target_kernel_replay(
ident_t *Loc, int64_t DeviceId, void *HostPtr, void *DeviceMemory,
- int64_t DeviceMemorySize, const llvm::offloading::EntryTy *Globals,
- int32_t NumGlobals, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,
- int32_t NumTeams, int32_t ThreadLimit, uint32_t SharedMemorySize,
- uint64_t LoopTripCount, KernelReplayOutcomeTy *ReplayOutcome);
+ void *ReuseDeviceAlloc, int64_t DeviceMemorySize,
+ const llvm::offloading::EntryTy *Globals, int32_t NumGlobals,
+ void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams,
+ int32_t ThreadLimit, uint32_t SharedMemorySize, uint64_t LoopTripCount,
+ KernelReplayOutcomeTy *ReplayOutcome);
void __tgt_set_info_flag(uint32_t);
diff --git a/offload/libomptarget/interface.cpp b/offload/libomptarget/interface.cpp
index 9dd206d140c18..1049cd72d0958 100644
--- a/offload/libomptarget/interface.cpp
+++ b/offload/libomptarget/interface.cpp
@@ -509,6 +509,9 @@ EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
/// \param DeviceMemory A pointer to an array storing device memory data to move
/// prior to kernel execution.
/// \param DeviceMemorySize The size of the above device memory data in bytes.
+/// \param ReuseDeviceAlloc Pointer to a device memory allocation that should
+/// be reused for the replay. If null, the replay
+/// will allocate the necessary device buffer.
/// \param TgtArgs An array of pointers of the pre-recorded target kernel
/// arguments.
/// \param TgtOffsets An array of pointers of the pre-recorded target kernel
@@ -521,10 +524,11 @@ EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
/// \return OMP_TGT_SUCCESS on success, OMP_TGT_FAIL on failure.
EXTERN int __tgt_target_kernel_replay(
ident_t *Loc, int64_t DeviceId, void *HostPtr, void *DeviceMemory,
- int64_t DeviceMemorySize, const llvm::offloading::EntryTy *Globals,
- int32_t NumGlobals, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,
- int32_t NumTeams, int32_t ThreadLimit, uint32_t SharedMemorySize,
- uint64_t LoopTripCount, KernelReplayOutcomeTy *ReplayOutcome) {
+ void *ReuseDeviceAlloc, int64_t DeviceMemorySize,
+ const llvm::offloading::EntryTy *Globals, int32_t NumGlobals,
+ void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams,
+ int32_t ThreadLimit, uint32_t SharedMemorySize, uint64_t LoopTripCount,
+ KernelReplayOutcomeTy *ReplayOutcome) {
assert(PM && "Runtime not initialized");
OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
if (checkDevice(DeviceId, Loc)) {
@@ -541,10 +545,11 @@ EXTERN int __tgt_target_kernel_replay(
/*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
AsyncInfoTy AsyncInfo(*DeviceOrErr);
- int Rc = target_replay(
- Loc, *DeviceOrErr, HostPtr, DeviceMemory, DeviceMemorySize, Globals,
- NumGlobals, TgtArgs, TgtOffsets, NumArgs, NumTeams, ThreadLimit,
- SharedMemorySize, LoopTripCount, AsyncInfo, ReplayOutcome);
+ int Rc =
+ target_replay(Loc, *DeviceOrErr, HostPtr, DeviceMemory, DeviceMemorySize,
+ ReuseDeviceAlloc, Globals, NumGlobals, TgtArgs, TgtOffsets,
+ NumArgs, NumTeams, ThreadLimit, SharedMemorySize,
+ LoopTripCount, AsyncInfo, ReplayOutcome);
if (Rc == OFFLOAD_SUCCESS)
Rc = AsyncInfo.synchronize();
diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp
index f06654c639a8e..82a0ed73317d5 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -2391,6 +2391,7 @@ int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr,
/// configuration.
int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
void *DeviceMemory, int64_t DeviceMemorySize,
+ void *ReuseDeviceAlloc,
const llvm::offloading::EntryTy *Globals, int32_t NumGlobals,
void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,
int32_t NumTeams, int32_t ThreadLimit,
@@ -2448,13 +2449,20 @@ int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
}
}
- void *TgtPtr = Device.allocData(DeviceMemorySize, /*HstPtr=*/nullptr,
- TARGET_ALLOC_DEFAULT);
+ // Reuse a previous device allocation or allocate a new device buffer.
+ void *&TgtPtr = ReuseDeviceAlloc;
+ if (!TgtPtr)
+ TgtPtr = Device.allocData(DeviceMemorySize, /*HstPtr=*/nullptr,
+ TARGET_ALLOC_DEFAULT);
if (!TgtPtr) {
REPORT() << "Failed to allocate device memory.";
return OFFLOAD_FAIL;
}
+ // Save the device allocation for future replays of the same kernel.
+ if (ReplayOutcome)
+ ReplayOutcome->ReplayDeviceAlloc = TgtPtr;
+
int Ret =
Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
diff --git a/offload/libomptarget/private.h b/offload/libomptarget/private.h
index 31b295bda613e..e52028cc060d9 100644
--- a/offload/libomptarget/private.h
+++ b/offload/libomptarget/private.h
@@ -30,15 +30,14 @@ extern int target_activate_rr(DeviceTy &Device, uint64_t MemorySize,
void *ReqAddr, bool IsRecord, bool SaveOutput,
bool EmitReport, const char *OutputDirPath);
-extern int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
- void *DeviceMemory, int64_t DeviceMemorySize,
- const llvm::offloading::EntryTy *Globals,
- int32_t NumGlobals, void **TgtArgs,
- ptrdiff_t *TgtOffsets, int32_t NumArgs,
- int32_t NumTeams, int32_t ThreadLimit,
- uint32_t SharedMemorySize, uint64_t LoopTripCount,
- AsyncInfoTy &AsyncInfo,
- KernelReplayOutcomeTy *ReplayOutcome);
+extern int
+target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr, void *DeviceMemory,
+ int64_t DeviceMemorySize, void *ReuseDeviceAlloc,
+ const llvm::offloading::EntryTy *Globals, int32_t NumGlobals,
+ void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,
+ int32_t NumTeams, int32_t ThreadLimit, uint32_t SharedMemorySize,
+ uint64_t LoopTripCount, AsyncInfoTy &AsyncInfo,
+ KernelReplayOutcomeTy *ReplayOutcome);
extern void handleTargetOutcome(bool Success, ident_t *Loc);
diff --git a/offload/plugins-nextgen/common/include/RecordReplay.h b/offload/plugins-nextgen/common/include/RecordReplay.h
index 0929a533effa4..65a861cc8a0cc 100644
--- a/offload/plugins-nextgen/common/include/RecordReplay.h
+++ b/offload/plugins-nextgen/common/include/RecordReplay.h
@@ -11,6 +11,7 @@
#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RECORDREPLAY_H
#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RECORDREPLAY_H
+#include <chrono>
#include <cstddef>
#include <cstdint>
#include <filesystem>
@@ -116,6 +117,10 @@ struct RecordReplayTy {
/// information about the kernel's replay, such as the snapshot file.
KernelReplayOutcomeTy *ReplayOutcome = nullptr;
+ /// The begin and end time points of the kernel execution.
+ using ClockTy = std::chrono::steady_clock;
+ mutable std::chrono::time_point<ClockTy> BeginTime, EndTime;
+
/// The number of occurrences during the execution.
mutable size_t Occurrences = 0;
@@ -129,6 +134,17 @@ struct RecordReplayTy {
NumTeams == Other.NumTeams && NumThreads == Other.NumThreads &&
SharedMemorySize == Other.SharedMemorySize);
}
+
+ /// Record the begin and end time points of the kernel execution.
+ void recordBeginTime() const { BeginTime = ClockTy::now(); }
+ void recordEndTime() const { EndTime = ClockTy::now(); }
+
+ /// Get the kernel execution time in nanoseconds.
+ uint64_t getRecordedTimeNs() const {
+ using DurationNsTy = std::chrono::duration<uint64_t, std::nano>;
+ return std::chrono::duration_cast<DurationNsTy>(EndTime - BeginTime)
+ .count();
+ }
};
struct InstanceHasher {
@@ -210,6 +226,15 @@ struct RecordReplayTy {
uint32_t NumThreads, uint32_t SharedMemorySize,
KernelReplayOutcomeTy *ReplayOutcome);
+ /// Unregister an instance once it has been replayed. Instances during
+ /// recording cannot be unregistered. Accessing the instance beyond this point
+ /// is invalid.
+ Error unregisterInstance(const InstanceTy &Instance);
+
+ /// Populate the replay outcome struct to forward some replay information.
+ void populateReplayOutcome(const InstanceTy &Instance,
+ KernelReplayOutcomeTy &Outcome);
+
/// Record the prologue data.
virtual Error
recordPrologueImpl(const GenericKernelTy &Kernel, const InstanceTy &Instance,
diff --git a/offload/plugins-nextgen/common/src/RecordReplay.cpp b/offload/plugins-nextgen/common/src/RecordReplay.cpp
index 0b03e3405ce58..8c4b012fdc9e2 100644
--- a/offload/plugins-nextgen/common/src/RecordReplay.cpp
+++ b/offload/plugins-nextgen/common/src/RecordReplay.cpp
@@ -89,17 +89,20 @@ Error RecordReplayTy::deinit() {
Error RecordReplayTy::emitInstanceReport() {
std::lock_guard<std::mutex> LG(InstancesLock);
- llvm::outs() << "=== record report begin ===\n";
- llvm::outs() << "directory: "
+ llvm::outs() << "=== Kernel Record Report ===\n";
+ llvm::outs() << "Directory: "
<< std::filesystem::absolute(OutputDirectory).string() << "\n";
- llvm::outs() << "kernels: " << Instances.size() << "\n";
+ llvm::outs() << "Total Instances: " << Instances.size() << "\n";
+ llvm::outs() << "JSON Filename, Kernel Name, Time (ns), Occurrences:\n";
SmallString<128> Filename;
for (const auto &Inst : Instances)
llvm::outs()
<< getFilename(Inst, FileTy::Descriptor, /*IncludeDir=*/false).c_str()
- << ": " << Inst.Kernel.getName() << "\n";
- llvm::outs() << "=== record report end ===\n";
+ << ", " << Inst.Kernel.getName() << ", " << Inst.getRecordedTimeNs()
+ << ", " << Inst.Occurrences << "\n";
+ llvm::outs() << "=== End Kernel Record Report ===\n";
+
return Plugin::success();
}
@@ -116,6 +119,16 @@ RecordReplayTy::registerInstance(const GenericKernelTy &Kernel,
return {*It, Inserted};
}
+Error RecordReplayTy::unregisterInstance(const InstanceTy &Instance) {
+ assert(isReplaying() && "Cannot unregister instance when recording.");
+
+ std::lock_guard<std::mutex> LG(InstancesLock);
+ size_t Erased = Instances.erase(Instance);
+ if (Erased != 1)
+ return Plugin::error(ErrorCode::INVALID_ARGUMENT, "invalid instance");
+ return Plugin::success();
+}
+
Expected<void *> RecordReplayTy::allocate(uint64_t Size) {
assert(StartAddr && "Expected memory has been pre-allocated");
constexpr int Alignment = 16;
@@ -147,34 +160,57 @@ Expected<RecordReplayTy::HandleTy> RecordReplayTy::recordPrologue(
(KernelExtraArgs) ? KernelExtraArgs->ReplayOutcome : nullptr);
HandleTy Handle{&Instance, First};
- if (isReplaying() || !First)
+ if (!First)
return Handle;
- if (auto Err = recordDescImpl(Kernel, Instance, KernelArgs, LaunchParams))
- return Err;
+ if (isRecording()) {
+ if (auto Err = recordDescImpl(Kernel, Instance, KernelArgs, LaunchParams))
+ return Err;
- if (auto Err = recordPrologueImpl(Kernel, Instance, KernelArgs, LaunchParams))
- return Err;
+ if (auto Err =
+ recordPrologueImpl(Kernel, Instance, KernelArgs, LaunchParams))
+ return Err;
+ }
+
+ // Start the timer for the kernel execution.
+ Instance.recordBeginTime();
return Handle;
}
Error RecordReplayTy::recordEpilogue(const GenericKernelTy &Kernel,
HandleTy Handle) {
- if (!shouldRecordEpilogue() || !Handle.Active)
+ if (!Handle.Active)
return Plugin::success();
+ // Stop the timer for the kernel execution.
const InstanceTy &Instance = *Handle.Instance;
- if (auto Err = recordEpilogueImpl(Kernel, Instance))
- return Err;
+ Instance.recordEndTime();
+
+ if (shouldRecordEpilogue())
+ if (auto Err = recordEpilogueImpl(Kernel, Instance))
+ return Err;
+
+ if (isReplaying() && Instance.ReplayOutcome)
+ populateReplayOutcome(Instance, *Instance.ReplayOutcome);
- // If necessary, inform the replaying tool about where the epilogue snapshot
- // file has been stored.
- if (isReplaying() && Instance.ReplayOutcome) {
+ // After a replay, unregister the instance so it can be replayed again. Do
+ // not access the instance object beyond this point.
+ if (isReplaying())
+ return unregisterInstance(Instance);
+
+ return Plugin::success();
+}
+
+void RecordReplayTy::populateReplayOutcome(const InstanceTy &Instance,
+ KernelReplayOutcomeTy &Outcome) {
+ // Only save the epilogue output filename if it was recorded.
+ if (shouldRecordEpilogue()) {
SmallString<128> Filename = getFilename(Instance, FileTy::EpilogueSnapshot);
- Instance.ReplayOutcome->OutputFilepath = Filename;
+ Outcome.OutputFilepath = Filename;
}
- return Plugin::success();
+ // Save the kernel replay time.
+ Outcome.KernelReplayTimeNs = Instance.getRecordedTimeNs();
}
Error NativeRecordReplayTy::recordPrologueImpl(
diff --git a/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp b/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
index 7e8ceb7c24c08..7526a81b450d8 100644
--- a/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
+++ b/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
@@ -58,6 +58,11 @@ static cl::opt<uint32_t> NumThreadsOpt("num-threads",
static cl::opt<int32_t> DeviceIdOpt("device-id", cl::desc("Set the device id."),
cl::init(-1), cl::cat(ReplayOptions));
+static cl::opt<uint32_t>
+ RepetitionsOpt("repetitions",
+ cl::desc("Set the number of replay repetitions."),
+ cl::init(1), cl::cat(ReplayOptions));
+
template <typename... ArgsTy>
Error createErr(const char *ErrFmt, ArgsTy &&...Args) {
return llvm::createStringError(llvm::inconvertibleErrorCode(), ErrFmt,
@@ -132,7 +137,6 @@ Error verifyReplayOutput(StringRef RecordOutputFilename,
return createErr("replay device memory failed to verify");
// Successfully verified.
- outs() << TOOL_PREFIX << "Replay device memory verified\n";
return Error::success();
}
@@ -315,26 +319,41 @@ Error replayKernel() {
auto RecordInputBuffer = std::move(RecordInputBufferOrErr.get());
KernelReplayOutcomeTy Outcome;
- Rc = __tgt_target_kernel_replay(
- /*Loc=*/nullptr, DeviceId, OffloadEntries[0].Address,
- const_cast<char *>(RecordInputBuffer->getBufferStart()),
- RecordInputBuffer->getBufferSize(),
- NumGlobals ? &OffloadEntries[1] : nullptr, NumGlobals, TgtArgs.data(),
- TgtArgOffsets.data(), NumArgs, NumTeams, NumThreads, SharedMemorySize,
- LoopTripCount, &Outcome);
- if (Rc != OMP_TGT_SUCCESS)
- return createErr("failed to replay kernel");
- // Verify the replay output if requested.
- if (VerifyOpt) {
- if (Outcome.OutputFilepath.empty())
- return createErr("replay output file was not generated");
-
- Filepath.replace_extension("record_output");
- return verifyReplayOutput(Filepath.c_str(), Outcome.OutputFilepath.c_str());
+ // Perform the kernel replay and verification (if needed) for each repetition.
+ for (uint32_t R = 0; R < RepetitionsOpt; ++R) {
+ Rc = __tgt_target_kernel_replay(
+ /*Loc=*/nullptr, DeviceId, OffloadEntries[0].Address,
+ const_cast<char *>(RecordInputBuffer->getBufferStart()),
+ R > 0 ? Outcome.ReplayDeviceAlloc : nullptr,
+ RecordInputBuffer->getBufferSize(),
+ NumGlobals ? &OffloadEntries[1] : nullptr, NumGlobals, TgtArgs.data(),
+ TgtArgOffsets.data(), NumArgs, NumTeams, NumThreads, SharedMemorySize,
+ LoopTripCount, &Outcome);
+ if (Rc != OMP_TGT_SUCCESS)
+ return createErr("failed to replay kernel");
+
+ // Verify the replay output if requested.
+ if (VerifyOpt) {
+ if (Outcome.OutputFilepath.empty())
+ return createErr("replay output file was not generated");
+
+ Filepath.replace_extension("record_output");
+ if (auto Err = verifyReplayOutput(Filepath.c_str(),
+ Outcome.OutputFilepath.c_str()))
+ return Err;
+ }
+
+ outs() << TOOL_PREFIX << " Replay time (" << R
+ << "): " << Outcome.KernelReplayTimeNs << " ns\n";
}
- outs() << TOOL_PREFIX << "Replay finished (verification skipped)\n";
+ // At this point, any verification done was successful.
+ if (VerifyOpt)
+ outs() << TOOL_PREFIX << " Replay done, device memory verified\n";
+ else
+ outs() << TOOL_PREFIX << " Replay done, verification skipped\n";
+
return Error::success();
}
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.This suggestion is invalid because no changes were made to the code.Suggestions cannot be applied while the pull request is closed.Suggestions cannot be applied while viewing a subset of changes.Only one suggestion per line can be applied in a batch.Add this suggestion to a batch that can be applied as a single commit.Applying suggestions on deleted lines is not supported.You must change the existing code in this line in order to create a valid suggestion.Outdated suggestions cannot be applied.This suggestion has been applied or marked resolved.Suggestions cannot be applied from pending reviews.Suggestions cannot be applied on multi-line comments.Suggestions cannot be applied while the pull request is queued to merge.Suggestion cannot be applied right now. Please check back later.
This PR extends the kernel replay tool to perform multiple replay repetitions on the same process. It also prints the execution time of the kernel replay, which includes the kernel launch and kernel synchronization (replay I/O time is excluded). Precise kernel timing should be obtained through the corresponding vendor tools.
The output report after recording has been improved as well.