Skip to content

[Offload][PGO] Fix PGO on NVPTX targets #143568

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions clang/lib/Driver/ToolChains/Clang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6247,9 +6247,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
Args.AddLastArg(CmdArgs, options::OPT_fconvergent_functions,
options::OPT_fno_convergent_functions);

// NVPTX doesn't support PGO or coverage
if (!Triple.isNVPTX())
addPGOAndCoverageFlags(TC, C, JA, Output, Args, SanitizeArgs, CmdArgs);
addPGOAndCoverageFlags(TC, C, JA, Output, Args, SanitizeArgs, CmdArgs);

Args.AddLastArg(CmdArgs, options::OPT_fclang_abi_compat_EQ);

Expand Down
33 changes: 0 additions & 33 deletions clang/test/Driver/cuda-no-pgo-or-coverage.cu

This file was deleted.

6 changes: 6 additions & 0 deletions llvm/include/llvm/ProfileData/InstrProf.h
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,12 @@ inline StringRef getInstrProfRegFuncsName() {
return "__llvm_profile_register_functions";
}

/// Return the name of function that initializes self-referential datavar values
/// on NVPTX targets
inline StringRef getInstrProfDelayedInitFuncName() {
return "__llvm_profile_delayed_data_var_init";
}

/// Return the name of the runtime interface that registers per-function control
/// data for one instrumented function.
inline StringRef getInstrProfRegFuncName() {
Expand Down
80 changes: 74 additions & 6 deletions llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,13 @@ class InstrLowerer final {
/// Create INSTR_PROF_DATA variable for counters and bitmaps.
void createDataVariable(InstrProfCntrInstBase *Inc);

/// Creates delayed initialiation function for data relative offsets
/// This is only relevant on NVPTX targets where circular constant structures
/// are not allowed
bool
emitDataDelayedInit(SmallVector<Function *> &Kernels,
SmallVector<const InstrProfCntrInstBase *> &ValueSites);

/// Get the counters for virtual table values, creating them if necessary.
void getOrCreateVTableProfData(GlobalVariable *GV);

Expand Down Expand Up @@ -947,11 +954,18 @@ bool InstrLowerer::lower() {
if (!ContainsProfiling && !CoverageNamesVar)
return MadeChange;

// Cached info for generating delayed offset calculations
// This is only relevant on NVPTX targets
SmallVector<Function *> Kernels;
SmallVector<const InstrProfCntrInstBase *> ValueSites;
Comment on lines +957 to +960
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you remind me why we need this for NVPTX and not AMDGPU?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These values are used by emitDataDelayedInit to generate the bitmap and counter pointers. We only need them on NVPTX because circular initial values work fine on AMD: #94268 (comment).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, seems like a surprising amount of code though. I would've just expected it to split up the initializer somehow. I.e.

int x[] = {0, x[0]}

Could be

int dummy = 0;
int x[] = {dummy, dummy}; 

or something?


// We did not know how many value sites there would be inside
// the instrumented function. This is counting the number of instrumented
// target value sites to enter it as field in the profile data variable.
for (Function &F : M) {
InstrProfCntrInstBase *FirstProfInst = nullptr;
if (F.getCallingConv() == CallingConv::PTX_Kernel)
Kernels.push_back(&F);
for (BasicBlock &BB : F) {
for (auto I = BB.begin(), E = BB.end(); I != E; I++) {
if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(I))
Expand All @@ -971,9 +985,12 @@ bool InstrLowerer::lower() {
// Also create the data variable based on the MCDCParams.
if (FirstProfInst != nullptr) {
static_cast<void>(getOrCreateRegionCounters(FirstProfInst));
ValueSites.push_back(FirstProfInst);
}
}

MadeChange |= emitDataDelayedInit(Kernels, ValueSites);

if (EnableVTableValueProfiling)
for (GlobalVariable &GV : M.globals())
// Global variables with type metadata are virtual table variables.
Expand Down Expand Up @@ -1734,6 +1751,13 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) {
return PD.RegionCounters;
}

// Calculates difference between two global variable addresses as an integer
Constant *globalVarDiff(Module &M, GlobalVariable *A, GlobalVariable *B) {
auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext());
return ConstantExpr::getSub(ConstantExpr::getPtrToInt(A, IntPtrTy),
ConstantExpr::getPtrToInt(B, IntPtrTy));
}

void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
// When debug information is correlated to profile data, a data variable
// is not needed.
Expand Down Expand Up @@ -1854,13 +1878,12 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
// Reference the counter variable with a label difference (link-time
// constant).
DataSectionKind = IPSK_data;
RelativeCounterPtr =
ConstantExpr::getSub(ConstantExpr::getPtrToInt(CounterPtr, IntPtrTy),
ConstantExpr::getPtrToInt(Data, IntPtrTy));
const Triple T(M.getTargetTriple());
RelativeCounterPtr = T.isNVPTX() ? ConstantInt::get(IntPtrTy, 0)
: globalVarDiff(M, CounterPtr, Data);
if (BitmapPtr != nullptr)
RelativeBitmapPtr =
ConstantExpr::getSub(ConstantExpr::getPtrToInt(BitmapPtr, IntPtrTy),
ConstantExpr::getPtrToInt(Data, IntPtrTy));
RelativeBitmapPtr = T.isNVPTX() ? ConstantInt::get(IntPtrTy, 0)
: globalVarDiff(M, BitmapPtr, Data);
}

Constant *DataVals[] = {
Expand All @@ -1887,6 +1910,51 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
ReferencedNames.push_back(NamePtr);
}

bool InstrLowerer::emitDataDelayedInit(
SmallVector<Function *> &Kernels,
SmallVector<const InstrProfCntrInstBase *> &ValueSites) {
const Triple T(M.getTargetTriple());
if (!T.isNVPTX() || ProfileCorrelate == InstrProfCorrelator::BINARY ||
Kernels.empty() || ValueSites.empty()) {
return false;
}

auto *VoidTy = Type::getVoidTy(M.getContext());
auto *Int32Ty = Type::getInt32Ty(M.getContext());
auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext());
auto *DelayedInitFTy = FunctionType::get(VoidTy, false);
auto *DelayedInitF =
Function::Create(DelayedInitFTy, GlobalValue::InternalLinkage,
getInstrProfDelayedInitFuncName(), M);

IRBuilder<> IRB(BasicBlock::Create(M.getContext(), "", DelayedInitF));

for (const auto *ValueSite : ValueSites) {
GlobalVariable *NamePtr = ValueSite->getName();
auto &PD = ProfileDataMap[NamePtr];
auto *RelativeCounter = globalVarDiff(M, PD.RegionCounters, PD.DataVar);
auto *RelativeCounterPtr =
IRB.CreateGEP(IntPtrTy, PD.DataVar, {ConstantInt::get(Int32Ty, 2)});
IRB.CreateStore(RelativeCounter, RelativeCounterPtr);
if (PD.RegionBitmaps != nullptr) {
auto *RelativeBitmap = globalVarDiff(M, PD.RegionBitmaps, PD.DataVar);
auto *RelativeBitmapPtr =
IRB.CreateGEP(IntPtrTy, PD.DataVar, {ConstantInt::get(Int32Ty, 3)});
IRB.CreateStore(RelativeBitmap, RelativeBitmapPtr);
}
}

IRB.CreateRetVoid();

for (auto *Kernel : Kernels) {
auto &KernelEntry = Kernel->getEntryBlock();
IRB.SetInsertPoint(KernelEntry.getFirstNonPHI());
IRB.CreateCall(DelayedInitF);
}

return true;
}

void InstrLowerer::emitVNodes() {
if (!ValueProfileStaticAlloc)
return;
Expand Down
2 changes: 1 addition & 1 deletion offload/test/offloading/gpupgo/pgo_device_and_host.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
// RUN: %target_triple.%basename_t.hfdi.profraw \
// RUN: | %fcheck-generic --check-prefix="LLVM-DEVICE"

// REQUIRES: amdgpu
// REQUIRES: gpu
// REQUIRES: pgo

int main() {
Expand Down
2 changes: 1 addition & 1 deletion offload/test/offloading/gpupgo/pgo_device_only.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
// RUN: %target_triple.%basename_t.clang.profraw | \
// RUN: %fcheck-generic --check-prefix="CLANG-PGO"

// REQUIRES: amdgpu
// REQUIRES: gpu
// REQUIRES: pgo

int test1(int a) { return a / 2; }
Expand Down
Loading