From ff6fb08849cf27b7ad3e5d2775f62f2a7408a727 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Thu, 22 May 2025 22:20:04 -0500 Subject: [PATCH] [Offload][PGO] Fix PGO on NVPTX targets --- clang/lib/Driver/ToolChains/Clang.cpp | 4 +- clang/test/Driver/cuda-no-pgo-or-coverage.cu | 33 -------- llvm/include/llvm/ProfileData/InstrProf.h | 6 ++ .../Instrumentation/InstrProfiling.cpp | 80 +++++++++++++++++-- .../offloading/gpupgo/pgo_device_and_host.c | 2 +- .../test/offloading/gpupgo/pgo_device_only.c | 2 +- 6 files changed, 83 insertions(+), 44 deletions(-) delete mode 100644 clang/test/Driver/cuda-no-pgo-or-coverage.cu diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 2d676aa0b9c8e..076ce44f4c9e3 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6247,9 +6247,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Args.AddLastArg(CmdArgs, options::OPT_fconvergent_functions, options::OPT_fno_convergent_functions); - // NVPTX doesn't support PGO or coverage - if (!Triple.isNVPTX()) - addPGOAndCoverageFlags(TC, C, JA, Output, Args, SanitizeArgs, CmdArgs); + addPGOAndCoverageFlags(TC, C, JA, Output, Args, SanitizeArgs, CmdArgs); Args.AddLastArg(CmdArgs, options::OPT_fclang_abi_compat_EQ); diff --git a/clang/test/Driver/cuda-no-pgo-or-coverage.cu b/clang/test/Driver/cuda-no-pgo-or-coverage.cu deleted file mode 100644 index b84587e1e182b..0000000000000 --- a/clang/test/Driver/cuda-no-pgo-or-coverage.cu +++ /dev/null @@ -1,33 +0,0 @@ -// Check that profiling/coverage arguments doen't get passed down to device-side -// compilation. -// -// -// XRUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \ -// XRUN: -fprofile-generate %s 2>&1 | \ -// XRUN: FileCheck --check-prefixes=CHECK,PROF %s -// -// RUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \ -// RUN: -fprofile-instr-generate %s 2>&1 | \ -// RUN: FileCheck --check-prefixes=CHECK,PROF %s -// -// RUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \ -// RUN: -coverage %s 2>&1 | \ -// RUN: FileCheck --check-prefixes=CHECK,GCOV %s -// -// RUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \ -// RUN: -ftest-coverage %s 2>&1 | \ -// RUN: FileCheck --check-prefixes=CHECK,GCOV %s -// -// RUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \ -// RUN: -fprofile-instr-generate -fcoverage-mapping %s 2>&1 | \ -// RUN: FileCheck --check-prefixes=CHECK,PROF %s -// -// -// CHECK-NOT: error: unsupported option '-fprofile -// CHECK-NOT: error: invalid argument -// CHECK-DAG: "-fcuda-is-device" -// CHECK-NOT: "-f{{[^"/]*coverage.*}}" -// CHECK-NOT: "-fprofile{{[^"]*}}" -// CHECK: "-triple" "x86_64-unknown-linux-gnu" -// PROF: "-fprofile{{.*}}" -// GCOV: "-coverage-notes-file= diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index f53602424b583..279ffab6431d4 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -171,6 +171,12 @@ inline StringRef getInstrProfRegFuncsName() { return "__llvm_profile_register_functions"; } +/// Return the name of function that initializes self-referential datavar values +/// on NVPTX targets +inline StringRef getInstrProfDelayedInitFuncName() { + return "__llvm_profile_delayed_data_var_init"; +} + /// Return the name of the runtime interface that registers per-function control /// data for one instrumented function. inline StringRef getInstrProfRegFuncName() { diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index fe3b0da33a009..3f619a6fd8dec 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -391,6 +391,13 @@ class InstrLowerer final { /// Create INSTR_PROF_DATA variable for counters and bitmaps. void createDataVariable(InstrProfCntrInstBase *Inc); + /// Creates delayed initialiation function for data relative offsets + /// This is only relevant on NVPTX targets where circular constant structures + /// are not allowed + bool + emitDataDelayedInit(SmallVector &Kernels, + SmallVector &ValueSites); + /// Get the counters for virtual table values, creating them if necessary. void getOrCreateVTableProfData(GlobalVariable *GV); @@ -947,11 +954,18 @@ bool InstrLowerer::lower() { if (!ContainsProfiling && !CoverageNamesVar) return MadeChange; + // Cached info for generating delayed offset calculations + // This is only relevant on NVPTX targets + SmallVector Kernels; + SmallVector ValueSites; + // We did not know how many value sites there would be inside // the instrumented function. This is counting the number of instrumented // target value sites to enter it as field in the profile data variable. for (Function &F : M) { InstrProfCntrInstBase *FirstProfInst = nullptr; + if (F.getCallingConv() == CallingConv::PTX_Kernel) + Kernels.push_back(&F); for (BasicBlock &BB : F) { for (auto I = BB.begin(), E = BB.end(); I != E; I++) { if (auto *Ind = dyn_cast(I)) @@ -971,9 +985,12 @@ bool InstrLowerer::lower() { // Also create the data variable based on the MCDCParams. if (FirstProfInst != nullptr) { static_cast(getOrCreateRegionCounters(FirstProfInst)); + ValueSites.push_back(FirstProfInst); } } + MadeChange |= emitDataDelayedInit(Kernels, ValueSites); + if (EnableVTableValueProfiling) for (GlobalVariable &GV : M.globals()) // Global variables with type metadata are virtual table variables. @@ -1734,6 +1751,13 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) { return PD.RegionCounters; } +// Calculates difference between two global variable addresses as an integer +Constant *globalVarDiff(Module &M, GlobalVariable *A, GlobalVariable *B) { + auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext()); + return ConstantExpr::getSub(ConstantExpr::getPtrToInt(A, IntPtrTy), + ConstantExpr::getPtrToInt(B, IntPtrTy)); +} + void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { // When debug information is correlated to profile data, a data variable // is not needed. @@ -1854,13 +1878,12 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { // Reference the counter variable with a label difference (link-time // constant). DataSectionKind = IPSK_data; - RelativeCounterPtr = - ConstantExpr::getSub(ConstantExpr::getPtrToInt(CounterPtr, IntPtrTy), - ConstantExpr::getPtrToInt(Data, IntPtrTy)); + const Triple T(M.getTargetTriple()); + RelativeCounterPtr = T.isNVPTX() ? ConstantInt::get(IntPtrTy, 0) + : globalVarDiff(M, CounterPtr, Data); if (BitmapPtr != nullptr) - RelativeBitmapPtr = - ConstantExpr::getSub(ConstantExpr::getPtrToInt(BitmapPtr, IntPtrTy), - ConstantExpr::getPtrToInt(Data, IntPtrTy)); + RelativeBitmapPtr = T.isNVPTX() ? ConstantInt::get(IntPtrTy, 0) + : globalVarDiff(M, BitmapPtr, Data); } Constant *DataVals[] = { @@ -1887,6 +1910,51 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { ReferencedNames.push_back(NamePtr); } +bool InstrLowerer::emitDataDelayedInit( + SmallVector &Kernels, + SmallVector &ValueSites) { + const Triple T(M.getTargetTriple()); + if (!T.isNVPTX() || ProfileCorrelate == InstrProfCorrelator::BINARY || + Kernels.empty() || ValueSites.empty()) { + return false; + } + + auto *VoidTy = Type::getVoidTy(M.getContext()); + auto *Int32Ty = Type::getInt32Ty(M.getContext()); + auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext()); + auto *DelayedInitFTy = FunctionType::get(VoidTy, false); + auto *DelayedInitF = + Function::Create(DelayedInitFTy, GlobalValue::InternalLinkage, + getInstrProfDelayedInitFuncName(), M); + + IRBuilder<> IRB(BasicBlock::Create(M.getContext(), "", DelayedInitF)); + + for (const auto *ValueSite : ValueSites) { + GlobalVariable *NamePtr = ValueSite->getName(); + auto &PD = ProfileDataMap[NamePtr]; + auto *RelativeCounter = globalVarDiff(M, PD.RegionCounters, PD.DataVar); + auto *RelativeCounterPtr = + IRB.CreateGEP(IntPtrTy, PD.DataVar, {ConstantInt::get(Int32Ty, 2)}); + IRB.CreateStore(RelativeCounter, RelativeCounterPtr); + if (PD.RegionBitmaps != nullptr) { + auto *RelativeBitmap = globalVarDiff(M, PD.RegionBitmaps, PD.DataVar); + auto *RelativeBitmapPtr = + IRB.CreateGEP(IntPtrTy, PD.DataVar, {ConstantInt::get(Int32Ty, 3)}); + IRB.CreateStore(RelativeBitmap, RelativeBitmapPtr); + } + } + + IRB.CreateRetVoid(); + + for (auto *Kernel : Kernels) { + auto &KernelEntry = Kernel->getEntryBlock(); + IRB.SetInsertPoint(KernelEntry.getFirstNonPHI()); + IRB.CreateCall(DelayedInitF); + } + + return true; +} + void InstrLowerer::emitVNodes() { if (!ValueProfileStaticAlloc) return; diff --git a/offload/test/offloading/gpupgo/pgo_device_and_host.c b/offload/test/offloading/gpupgo/pgo_device_and_host.c index 3e95791ce9a50..68200d297f0fc 100644 --- a/offload/test/offloading/gpupgo/pgo_device_and_host.c +++ b/offload/test/offloading/gpupgo/pgo_device_and_host.c @@ -48,7 +48,7 @@ // RUN: %target_triple.%basename_t.hfdi.profraw \ // RUN: | %fcheck-generic --check-prefix="LLVM-DEVICE" -// REQUIRES: amdgpu +// REQUIRES: gpu // REQUIRES: pgo int main() { diff --git a/offload/test/offloading/gpupgo/pgo_device_only.c b/offload/test/offloading/gpupgo/pgo_device_only.c index 2939af613b6dd..f1221f1927716 100644 --- a/offload/test/offloading/gpupgo/pgo_device_only.c +++ b/offload/test/offloading/gpupgo/pgo_device_only.c @@ -14,7 +14,7 @@ // RUN: %target_triple.%basename_t.clang.profraw | \ // RUN: %fcheck-generic --check-prefix="CLANG-PGO" -// REQUIRES: amdgpu +// REQUIRES: gpu // REQUIRES: pgo int test1(int a) { return a / 2; }