-
Notifications
You must be signed in to change notification settings - Fork 13.9k
[Offload][PGO] Fix PGO on NVPTX targets #143568
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-pgo @llvm/pr-subscribers-offload Author: Ethan Luis McDonough (EthanLuisMcDonough) ChangesThis pull request aims to fix the issues with PGO on NVPTX targets outlined in #94268. This patch makes it so that the CounterPtr and BitmapPtr fields are initialized to zero and assigned values inside the Full diff: https://github.com/llvm/llvm-project/pull/143568.diff 6 Files Affected:
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 2d676aa0b9c8e..076ce44f4c9e3 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -6247,9 +6247,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
Args.AddLastArg(CmdArgs, options::OPT_fconvergent_functions,
options::OPT_fno_convergent_functions);
- // NVPTX doesn't support PGO or coverage
- if (!Triple.isNVPTX())
- addPGOAndCoverageFlags(TC, C, JA, Output, Args, SanitizeArgs, CmdArgs);
+ addPGOAndCoverageFlags(TC, C, JA, Output, Args, SanitizeArgs, CmdArgs);
Args.AddLastArg(CmdArgs, options::OPT_fclang_abi_compat_EQ);
diff --git a/clang/test/Driver/cuda-no-pgo-or-coverage.cu b/clang/test/Driver/cuda-no-pgo-or-coverage.cu
deleted file mode 100644
index b84587e1e182b..0000000000000
--- a/clang/test/Driver/cuda-no-pgo-or-coverage.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-// Check that profiling/coverage arguments doen't get passed down to device-side
-// compilation.
-//
-//
-// XRUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
-// XRUN: -fprofile-generate %s 2>&1 | \
-// XRUN: FileCheck --check-prefixes=CHECK,PROF %s
-//
-// RUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
-// RUN: -fprofile-instr-generate %s 2>&1 | \
-// RUN: FileCheck --check-prefixes=CHECK,PROF %s
-//
-// RUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
-// RUN: -coverage %s 2>&1 | \
-// RUN: FileCheck --check-prefixes=CHECK,GCOV %s
-//
-// RUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
-// RUN: -ftest-coverage %s 2>&1 | \
-// RUN: FileCheck --check-prefixes=CHECK,GCOV %s
-//
-// RUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
-// RUN: -fprofile-instr-generate -fcoverage-mapping %s 2>&1 | \
-// RUN: FileCheck --check-prefixes=CHECK,PROF %s
-//
-//
-// CHECK-NOT: error: unsupported option '-fprofile
-// CHECK-NOT: error: invalid argument
-// CHECK-DAG: "-fcuda-is-device"
-// CHECK-NOT: "-f{{[^"/]*coverage.*}}"
-// CHECK-NOT: "-fprofile{{[^"]*}}"
-// CHECK: "-triple" "x86_64-unknown-linux-gnu"
-// PROF: "-fprofile{{.*}}"
-// GCOV: "-coverage-notes-file=
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index f53602424b583..279ffab6431d4 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -171,6 +171,12 @@ inline StringRef getInstrProfRegFuncsName() {
return "__llvm_profile_register_functions";
}
+/// Return the name of function that initializes self-referential datavar values
+/// on NVPTX targets
+inline StringRef getInstrProfDelayedInitFuncName() {
+ return "__llvm_profile_delayed_data_var_init";
+}
+
/// Return the name of the runtime interface that registers per-function control
/// data for one instrumented function.
inline StringRef getInstrProfRegFuncName() {
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index fe3b0da33a009..3f619a6fd8dec 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -391,6 +391,13 @@ class InstrLowerer final {
/// Create INSTR_PROF_DATA variable for counters and bitmaps.
void createDataVariable(InstrProfCntrInstBase *Inc);
+ /// Creates delayed initialiation function for data relative offsets
+ /// This is only relevant on NVPTX targets where circular constant structures
+ /// are not allowed
+ bool
+ emitDataDelayedInit(SmallVector<Function *> &Kernels,
+ SmallVector<const InstrProfCntrInstBase *> &ValueSites);
+
/// Get the counters for virtual table values, creating them if necessary.
void getOrCreateVTableProfData(GlobalVariable *GV);
@@ -947,11 +954,18 @@ bool InstrLowerer::lower() {
if (!ContainsProfiling && !CoverageNamesVar)
return MadeChange;
+ // Cached info for generating delayed offset calculations
+ // This is only relevant on NVPTX targets
+ SmallVector<Function *> Kernels;
+ SmallVector<const InstrProfCntrInstBase *> ValueSites;
+
// We did not know how many value sites there would be inside
// the instrumented function. This is counting the number of instrumented
// target value sites to enter it as field in the profile data variable.
for (Function &F : M) {
InstrProfCntrInstBase *FirstProfInst = nullptr;
+ if (F.getCallingConv() == CallingConv::PTX_Kernel)
+ Kernels.push_back(&F);
for (BasicBlock &BB : F) {
for (auto I = BB.begin(), E = BB.end(); I != E; I++) {
if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(I))
@@ -971,9 +985,12 @@ bool InstrLowerer::lower() {
// Also create the data variable based on the MCDCParams.
if (FirstProfInst != nullptr) {
static_cast<void>(getOrCreateRegionCounters(FirstProfInst));
+ ValueSites.push_back(FirstProfInst);
}
}
+ MadeChange |= emitDataDelayedInit(Kernels, ValueSites);
+
if (EnableVTableValueProfiling)
for (GlobalVariable &GV : M.globals())
// Global variables with type metadata are virtual table variables.
@@ -1734,6 +1751,13 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) {
return PD.RegionCounters;
}
+// Calculates difference between two global variable addresses as an integer
+Constant *globalVarDiff(Module &M, GlobalVariable *A, GlobalVariable *B) {
+ auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext());
+ return ConstantExpr::getSub(ConstantExpr::getPtrToInt(A, IntPtrTy),
+ ConstantExpr::getPtrToInt(B, IntPtrTy));
+}
+
void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
// When debug information is correlated to profile data, a data variable
// is not needed.
@@ -1854,13 +1878,12 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
// Reference the counter variable with a label difference (link-time
// constant).
DataSectionKind = IPSK_data;
- RelativeCounterPtr =
- ConstantExpr::getSub(ConstantExpr::getPtrToInt(CounterPtr, IntPtrTy),
- ConstantExpr::getPtrToInt(Data, IntPtrTy));
+ const Triple T(M.getTargetTriple());
+ RelativeCounterPtr = T.isNVPTX() ? ConstantInt::get(IntPtrTy, 0)
+ : globalVarDiff(M, CounterPtr, Data);
if (BitmapPtr != nullptr)
- RelativeBitmapPtr =
- ConstantExpr::getSub(ConstantExpr::getPtrToInt(BitmapPtr, IntPtrTy),
- ConstantExpr::getPtrToInt(Data, IntPtrTy));
+ RelativeBitmapPtr = T.isNVPTX() ? ConstantInt::get(IntPtrTy, 0)
+ : globalVarDiff(M, BitmapPtr, Data);
}
Constant *DataVals[] = {
@@ -1887,6 +1910,51 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
ReferencedNames.push_back(NamePtr);
}
+bool InstrLowerer::emitDataDelayedInit(
+ SmallVector<Function *> &Kernels,
+ SmallVector<const InstrProfCntrInstBase *> &ValueSites) {
+ const Triple T(M.getTargetTriple());
+ if (!T.isNVPTX() || ProfileCorrelate == InstrProfCorrelator::BINARY ||
+ Kernels.empty() || ValueSites.empty()) {
+ return false;
+ }
+
+ auto *VoidTy = Type::getVoidTy(M.getContext());
+ auto *Int32Ty = Type::getInt32Ty(M.getContext());
+ auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext());
+ auto *DelayedInitFTy = FunctionType::get(VoidTy, false);
+ auto *DelayedInitF =
+ Function::Create(DelayedInitFTy, GlobalValue::InternalLinkage,
+ getInstrProfDelayedInitFuncName(), M);
+
+ IRBuilder<> IRB(BasicBlock::Create(M.getContext(), "", DelayedInitF));
+
+ for (const auto *ValueSite : ValueSites) {
+ GlobalVariable *NamePtr = ValueSite->getName();
+ auto &PD = ProfileDataMap[NamePtr];
+ auto *RelativeCounter = globalVarDiff(M, PD.RegionCounters, PD.DataVar);
+ auto *RelativeCounterPtr =
+ IRB.CreateGEP(IntPtrTy, PD.DataVar, {ConstantInt::get(Int32Ty, 2)});
+ IRB.CreateStore(RelativeCounter, RelativeCounterPtr);
+ if (PD.RegionBitmaps != nullptr) {
+ auto *RelativeBitmap = globalVarDiff(M, PD.RegionBitmaps, PD.DataVar);
+ auto *RelativeBitmapPtr =
+ IRB.CreateGEP(IntPtrTy, PD.DataVar, {ConstantInt::get(Int32Ty, 3)});
+ IRB.CreateStore(RelativeBitmap, RelativeBitmapPtr);
+ }
+ }
+
+ IRB.CreateRetVoid();
+
+ for (auto *Kernel : Kernels) {
+ auto &KernelEntry = Kernel->getEntryBlock();
+ IRB.SetInsertPoint(KernelEntry.getFirstNonPHI());
+ IRB.CreateCall(DelayedInitF);
+ }
+
+ return true;
+}
+
void InstrLowerer::emitVNodes() {
if (!ValueProfileStaticAlloc)
return;
diff --git a/offload/test/offloading/gpupgo/pgo_device_and_host.c b/offload/test/offloading/gpupgo/pgo_device_and_host.c
index 3e95791ce9a50..68200d297f0fc 100644
--- a/offload/test/offloading/gpupgo/pgo_device_and_host.c
+++ b/offload/test/offloading/gpupgo/pgo_device_and_host.c
@@ -48,7 +48,7 @@
// RUN: %target_triple.%basename_t.hfdi.profraw \
// RUN: | %fcheck-generic --check-prefix="LLVM-DEVICE"
-// REQUIRES: amdgpu
+// REQUIRES: gpu
// REQUIRES: pgo
int main() {
diff --git a/offload/test/offloading/gpupgo/pgo_device_only.c b/offload/test/offloading/gpupgo/pgo_device_only.c
index 2939af613b6dd..f1221f1927716 100644
--- a/offload/test/offloading/gpupgo/pgo_device_only.c
+++ b/offload/test/offloading/gpupgo/pgo_device_only.c
@@ -14,7 +14,7 @@
// RUN: %target_triple.%basename_t.clang.profraw | \
// RUN: %fcheck-generic --check-prefix="CLANG-PGO"
-// REQUIRES: amdgpu
+// REQUIRES: gpu
// REQUIRES: pgo
int test1(int a) { return a / 2; }
|
@llvm/pr-subscribers-clang-driver Author: Ethan Luis McDonough (EthanLuisMcDonough) ChangesThis pull request aims to fix the issues with PGO on NVPTX targets outlined in #94268. This patch makes it so that the CounterPtr and BitmapPtr fields are initialized to zero and assigned values inside the Full diff: https://github.com/llvm/llvm-project/pull/143568.diff 6 Files Affected:
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 2d676aa0b9c8e..076ce44f4c9e3 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -6247,9 +6247,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
Args.AddLastArg(CmdArgs, options::OPT_fconvergent_functions,
options::OPT_fno_convergent_functions);
- // NVPTX doesn't support PGO or coverage
- if (!Triple.isNVPTX())
- addPGOAndCoverageFlags(TC, C, JA, Output, Args, SanitizeArgs, CmdArgs);
+ addPGOAndCoverageFlags(TC, C, JA, Output, Args, SanitizeArgs, CmdArgs);
Args.AddLastArg(CmdArgs, options::OPT_fclang_abi_compat_EQ);
diff --git a/clang/test/Driver/cuda-no-pgo-or-coverage.cu b/clang/test/Driver/cuda-no-pgo-or-coverage.cu
deleted file mode 100644
index b84587e1e182b..0000000000000
--- a/clang/test/Driver/cuda-no-pgo-or-coverage.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-// Check that profiling/coverage arguments doen't get passed down to device-side
-// compilation.
-//
-//
-// XRUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
-// XRUN: -fprofile-generate %s 2>&1 | \
-// XRUN: FileCheck --check-prefixes=CHECK,PROF %s
-//
-// RUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
-// RUN: -fprofile-instr-generate %s 2>&1 | \
-// RUN: FileCheck --check-prefixes=CHECK,PROF %s
-//
-// RUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
-// RUN: -coverage %s 2>&1 | \
-// RUN: FileCheck --check-prefixes=CHECK,GCOV %s
-//
-// RUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
-// RUN: -ftest-coverage %s 2>&1 | \
-// RUN: FileCheck --check-prefixes=CHECK,GCOV %s
-//
-// RUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
-// RUN: -fprofile-instr-generate -fcoverage-mapping %s 2>&1 | \
-// RUN: FileCheck --check-prefixes=CHECK,PROF %s
-//
-//
-// CHECK-NOT: error: unsupported option '-fprofile
-// CHECK-NOT: error: invalid argument
-// CHECK-DAG: "-fcuda-is-device"
-// CHECK-NOT: "-f{{[^"/]*coverage.*}}"
-// CHECK-NOT: "-fprofile{{[^"]*}}"
-// CHECK: "-triple" "x86_64-unknown-linux-gnu"
-// PROF: "-fprofile{{.*}}"
-// GCOV: "-coverage-notes-file=
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index f53602424b583..279ffab6431d4 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -171,6 +171,12 @@ inline StringRef getInstrProfRegFuncsName() {
return "__llvm_profile_register_functions";
}
+/// Return the name of function that initializes self-referential datavar values
+/// on NVPTX targets
+inline StringRef getInstrProfDelayedInitFuncName() {
+ return "__llvm_profile_delayed_data_var_init";
+}
+
/// Return the name of the runtime interface that registers per-function control
/// data for one instrumented function.
inline StringRef getInstrProfRegFuncName() {
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index fe3b0da33a009..3f619a6fd8dec 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -391,6 +391,13 @@ class InstrLowerer final {
/// Create INSTR_PROF_DATA variable for counters and bitmaps.
void createDataVariable(InstrProfCntrInstBase *Inc);
+ /// Creates delayed initialiation function for data relative offsets
+ /// This is only relevant on NVPTX targets where circular constant structures
+ /// are not allowed
+ bool
+ emitDataDelayedInit(SmallVector<Function *> &Kernels,
+ SmallVector<const InstrProfCntrInstBase *> &ValueSites);
+
/// Get the counters for virtual table values, creating them if necessary.
void getOrCreateVTableProfData(GlobalVariable *GV);
@@ -947,11 +954,18 @@ bool InstrLowerer::lower() {
if (!ContainsProfiling && !CoverageNamesVar)
return MadeChange;
+ // Cached info for generating delayed offset calculations
+ // This is only relevant on NVPTX targets
+ SmallVector<Function *> Kernels;
+ SmallVector<const InstrProfCntrInstBase *> ValueSites;
+
// We did not know how many value sites there would be inside
// the instrumented function. This is counting the number of instrumented
// target value sites to enter it as field in the profile data variable.
for (Function &F : M) {
InstrProfCntrInstBase *FirstProfInst = nullptr;
+ if (F.getCallingConv() == CallingConv::PTX_Kernel)
+ Kernels.push_back(&F);
for (BasicBlock &BB : F) {
for (auto I = BB.begin(), E = BB.end(); I != E; I++) {
if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(I))
@@ -971,9 +985,12 @@ bool InstrLowerer::lower() {
// Also create the data variable based on the MCDCParams.
if (FirstProfInst != nullptr) {
static_cast<void>(getOrCreateRegionCounters(FirstProfInst));
+ ValueSites.push_back(FirstProfInst);
}
}
+ MadeChange |= emitDataDelayedInit(Kernels, ValueSites);
+
if (EnableVTableValueProfiling)
for (GlobalVariable &GV : M.globals())
// Global variables with type metadata are virtual table variables.
@@ -1734,6 +1751,13 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) {
return PD.RegionCounters;
}
+// Calculates difference between two global variable addresses as an integer
+Constant *globalVarDiff(Module &M, GlobalVariable *A, GlobalVariable *B) {
+ auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext());
+ return ConstantExpr::getSub(ConstantExpr::getPtrToInt(A, IntPtrTy),
+ ConstantExpr::getPtrToInt(B, IntPtrTy));
+}
+
void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
// When debug information is correlated to profile data, a data variable
// is not needed.
@@ -1854,13 +1878,12 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
// Reference the counter variable with a label difference (link-time
// constant).
DataSectionKind = IPSK_data;
- RelativeCounterPtr =
- ConstantExpr::getSub(ConstantExpr::getPtrToInt(CounterPtr, IntPtrTy),
- ConstantExpr::getPtrToInt(Data, IntPtrTy));
+ const Triple T(M.getTargetTriple());
+ RelativeCounterPtr = T.isNVPTX() ? ConstantInt::get(IntPtrTy, 0)
+ : globalVarDiff(M, CounterPtr, Data);
if (BitmapPtr != nullptr)
- RelativeBitmapPtr =
- ConstantExpr::getSub(ConstantExpr::getPtrToInt(BitmapPtr, IntPtrTy),
- ConstantExpr::getPtrToInt(Data, IntPtrTy));
+ RelativeBitmapPtr = T.isNVPTX() ? ConstantInt::get(IntPtrTy, 0)
+ : globalVarDiff(M, BitmapPtr, Data);
}
Constant *DataVals[] = {
@@ -1887,6 +1910,51 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
ReferencedNames.push_back(NamePtr);
}
+bool InstrLowerer::emitDataDelayedInit(
+ SmallVector<Function *> &Kernels,
+ SmallVector<const InstrProfCntrInstBase *> &ValueSites) {
+ const Triple T(M.getTargetTriple());
+ if (!T.isNVPTX() || ProfileCorrelate == InstrProfCorrelator::BINARY ||
+ Kernels.empty() || ValueSites.empty()) {
+ return false;
+ }
+
+ auto *VoidTy = Type::getVoidTy(M.getContext());
+ auto *Int32Ty = Type::getInt32Ty(M.getContext());
+ auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext());
+ auto *DelayedInitFTy = FunctionType::get(VoidTy, false);
+ auto *DelayedInitF =
+ Function::Create(DelayedInitFTy, GlobalValue::InternalLinkage,
+ getInstrProfDelayedInitFuncName(), M);
+
+ IRBuilder<> IRB(BasicBlock::Create(M.getContext(), "", DelayedInitF));
+
+ for (const auto *ValueSite : ValueSites) {
+ GlobalVariable *NamePtr = ValueSite->getName();
+ auto &PD = ProfileDataMap[NamePtr];
+ auto *RelativeCounter = globalVarDiff(M, PD.RegionCounters, PD.DataVar);
+ auto *RelativeCounterPtr =
+ IRB.CreateGEP(IntPtrTy, PD.DataVar, {ConstantInt::get(Int32Ty, 2)});
+ IRB.CreateStore(RelativeCounter, RelativeCounterPtr);
+ if (PD.RegionBitmaps != nullptr) {
+ auto *RelativeBitmap = globalVarDiff(M, PD.RegionBitmaps, PD.DataVar);
+ auto *RelativeBitmapPtr =
+ IRB.CreateGEP(IntPtrTy, PD.DataVar, {ConstantInt::get(Int32Ty, 3)});
+ IRB.CreateStore(RelativeBitmap, RelativeBitmapPtr);
+ }
+ }
+
+ IRB.CreateRetVoid();
+
+ for (auto *Kernel : Kernels) {
+ auto &KernelEntry = Kernel->getEntryBlock();
+ IRB.SetInsertPoint(KernelEntry.getFirstNonPHI());
+ IRB.CreateCall(DelayedInitF);
+ }
+
+ return true;
+}
+
void InstrLowerer::emitVNodes() {
if (!ValueProfileStaticAlloc)
return;
diff --git a/offload/test/offloading/gpupgo/pgo_device_and_host.c b/offload/test/offloading/gpupgo/pgo_device_and_host.c
index 3e95791ce9a50..68200d297f0fc 100644
--- a/offload/test/offloading/gpupgo/pgo_device_and_host.c
+++ b/offload/test/offloading/gpupgo/pgo_device_and_host.c
@@ -48,7 +48,7 @@
// RUN: %target_triple.%basename_t.hfdi.profraw \
// RUN: | %fcheck-generic --check-prefix="LLVM-DEVICE"
-// REQUIRES: amdgpu
+// REQUIRES: gpu
// REQUIRES: pgo
int main() {
diff --git a/offload/test/offloading/gpupgo/pgo_device_only.c b/offload/test/offloading/gpupgo/pgo_device_only.c
index 2939af613b6dd..f1221f1927716 100644
--- a/offload/test/offloading/gpupgo/pgo_device_only.c
+++ b/offload/test/offloading/gpupgo/pgo_device_only.c
@@ -14,7 +14,7 @@
// RUN: %target_triple.%basename_t.clang.profraw | \
// RUN: %fcheck-generic --check-prefix="CLANG-PGO"
-// REQUIRES: amdgpu
+// REQUIRES: gpu
// REQUIRES: pgo
int test1(int a) { return a / 2; }
|
// Cached info for generating delayed offset calculations | ||
// This is only relevant on NVPTX targets | ||
SmallVector<Function *> Kernels; | ||
SmallVector<const InstrProfCntrInstBase *> ValueSites; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you remind me why we need this for NVPTX and not AMDGPU?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These values are used by emitDataDelayedInit
to generate the bitmap and counter pointers. We only need them on NVPTX because circular initial values work fine on AMD: #94268 (comment).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, seems like a surprising amount of code though. I would've just expected it to split up the initializer somehow. I.e.
int x[] = {0, x[0]}
Could be
int dummy = 0;
int x[] = {dummy, dummy};
or something?
This pull request aims to fix the issues with PGO on NVPTX targets outlined in #94268. This patch makes it so that the CounterPtr and BitmapPtr fields are initialized to zero and assigned values inside the
__llvm_profile_delayed_data_var_init
function on NVIDIA GPUs. This function is inserted at the start of each kernel function.