-
Notifications
You must be signed in to change notification settings - Fork 13.4k
[OpenMP][OpenMPIRBuilder] Move copyInput to a passed in lambda function and re-order kernel argument load/stores #68124
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-flang-openmp @llvm/pr-subscribers-mlir-openmp ChangesThis patch moves the existing copyInput function This allows more flexibility in how the function The idea is to eventually replace/build on For now this should be an NFC as far as Full diff: https://github.com/llvm/llvm-project/pull/68124.diff 4 Files Affected:
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 75da461cfd8d95e..e3f1cddb72fa03d 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2166,6 +2166,9 @@ class OpenMPIRBuilder {
using TargetBodyGenCallbackTy = function_ref<InsertPointTy(
InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
+ using TargetGenArgAccessorsCallbackTy = function_ref<Value *(
+ Argument &Arg, Value *Input, IRBuilderBase &Builder)>;
+
/// Generator for '#omp target'
///
/// \param Loc where the target data construct was encountered.
@@ -2177,6 +2180,8 @@ class OpenMPIRBuilder {
/// \param Inputs The input values to the region that will be passed.
/// as arguments to the outlined function.
/// \param BodyGenCB Callback that will generate the region code.
+ /// \param ArgAccessorFuncCB Callback that will generate accessors
+ /// instructions for passed in target arguments where neccessary
InsertPointTy createTarget(const LocationDescription &Loc,
OpenMPIRBuilder::InsertPointTy AllocaIP,
OpenMPIRBuilder::InsertPointTy CodeGenIP,
@@ -2184,7 +2189,8 @@ class OpenMPIRBuilder {
int32_t NumThreads,
SmallVectorImpl<Value *> &Inputs,
GenMapInfoCallbackTy GenMapInfoCB,
- TargetBodyGenCallbackTy BodyGenCB);
+ TargetBodyGenCallbackTy BodyGenCB,
+ TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB);
/// Returns __kmpc_for_static_init_* runtime function for the specified
/// size \a IVSize and sign \a IVSigned. Will create a distribute call
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 72e1af55fe63f60..bcb5d07bbf7edc8 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4539,25 +4539,11 @@ FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
return getOrCreateRuntimeFunction(M, Name);
}
-// Copy input from pointer or i64 to the expected argument type.
-static Value *copyInput(IRBuilderBase &Builder, unsigned AddrSpace,
- Value *Input, Argument &Arg) {
- auto Addr = Builder.CreateAlloca(Arg.getType()->isPointerTy()
- ? Arg.getType()
- : Type::getInt64Ty(Builder.getContext()),
- AddrSpace);
- auto AddrAscast =
- Builder.CreatePointerBitCastOrAddrSpaceCast(Addr, Input->getType());
- Builder.CreateStore(&Arg, AddrAscast);
- auto Copy = Builder.CreateLoad(Arg.getType(), AddrAscast);
-
- return Copy;
-}
-
-static Function *
-createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
- StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
- OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc) {
+static Function *createOutlinedFunction(
+ OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, StringRef FuncName,
+ SmallVectorImpl<Value *> &Inputs,
+ OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
+ OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
SmallVector<Type *> ParameterTypes;
if (OMPBuilder.Config.isTargetDevice()) {
// All parameters to target devices are passed as pointers
@@ -4603,12 +4589,7 @@ createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
Value *Input = std::get<0>(InArg);
Argument &Arg = std::get<1>(InArg);
- Value *InputCopy =
- OMPBuilder.Config.isTargetDevice()
- ? copyInput(Builder,
- OMPBuilder.M.getDataLayout().getAllocaAddrSpace(),
- Input, Arg)
- : &Arg;
+ Value *InputCopy = ArgAccessorFuncCB(Arg, Input, Builder);
// Collect all the instructions
for (User *User : make_early_inc_range(Input->users()))
@@ -4623,18 +4604,19 @@ createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
return Func;
}
-static void
-emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
- TargetRegionEntryInfo &EntryInfo,
- Function *&OutlinedFn, Constant *&OutlinedFnID,
- int32_t NumTeams, int32_t NumThreads,
- SmallVectorImpl<Value *> &Inputs,
- OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc) {
+static void emitTargetOutlinedFunction(
+ OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
+ TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn,
+ Constant *&OutlinedFnID, int32_t NumTeams, int32_t NumThreads,
+ SmallVectorImpl<Value *> &Inputs,
+ OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
+ OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
- [&OMPBuilder, &Builder, &Inputs, &CBFunc](StringRef EntryFnName) {
+ [&OMPBuilder, &Builder, &Inputs, &CBFunc,
+ &ArgAccessorFuncCB](StringRef EntryFnName) {
return createOutlinedFunction(OMPBuilder, Builder, EntryFnName, Inputs,
- CBFunc);
+ CBFunc, ArgAccessorFuncCB);
};
OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction,
@@ -4698,7 +4680,9 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget(
const LocationDescription &Loc, InsertPointTy AllocaIP,
InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, int32_t NumTeams,
int32_t NumThreads, SmallVectorImpl<Value *> &Args,
- GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy CBFunc) {
+ GenMapInfoCallbackTy GenMapInfoCB,
+ OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
+ OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB) {
if (!updateToLocation(Loc))
return InsertPointTy();
@@ -4707,7 +4691,8 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget(
Function *OutlinedFn;
Constant *OutlinedFnID;
emitTargetOutlinedFunction(*this, Builder, EntryInfo, OutlinedFn,
- OutlinedFnID, NumTeams, NumThreads, Args, CBFunc);
+ OutlinedFnID, NumTeams, NumThreads, Args, CBFunc,
+ ArgAccessorFuncCB);
if (!Config.isTargetDevice())
emitTargetCall(*this, Builder, AllocaIP, OutlinedFn, OutlinedFnID, NumTeams,
NumThreads, Args, GenMapInfoCB);
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index fd524f6067ee0ea..81733f1a2790287 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -5236,6 +5236,24 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) {
Inputs.push_back(BPtr);
Inputs.push_back(CPtr);
+ auto SimpleArgAccessorCB = [&OMPBuilder](llvm::Argument &Arg,
+ llvm::Value *Input,
+ IRBuilderBase &Builder) {
+ if (!OMPBuilder.Config.isTargetDevice())
+ return cast<llvm::Value>(&Arg);
+
+ llvm::Value *Addr = Builder.CreateAlloca(
+ Arg.getType()->isPointerTy() ? Arg.getType()
+ : Type::getInt64Ty(Builder.getContext()),
+ OMPBuilder.M.getDataLayout().getAllocaAddrSpace());
+ llvm::Value *AddrAscast =
+ Builder.CreatePointerBitCastOrAddrSpaceCast(Addr, Input->getType());
+ Builder.CreateStore(&Arg, AddrAscast);
+ llvm::Value *Copy = Builder.CreateLoad(Arg.getType(), AddrAscast);
+
+ return Copy;
+ };
+
llvm::OpenMPIRBuilder::MapInfosTy CombinedInfos;
auto GenMapInfoCB = [&](llvm::OpenMPIRBuilder::InsertPointTy codeGenIP)
-> llvm::OpenMPIRBuilder::MapInfosTy & {
@@ -5245,9 +5263,9 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) {
TargetRegionEntryInfo EntryInfo("func", 42, 4711, 17);
OpenMPIRBuilder::LocationDescription OmpLoc({Builder.saveIP(), DL});
- Builder.restoreIP(OMPBuilder.createTarget(OmpLoc, Builder.saveIP(),
- Builder.saveIP(), EntryInfo, -1, 0,
- Inputs, GenMapInfoCB, BodyGenCB));
+ Builder.restoreIP(OMPBuilder.createTarget(
+ OmpLoc, Builder.saveIP(), Builder.saveIP(), EntryInfo, -1, 0, Inputs,
+ GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB));
OMPBuilder.finalize();
Builder.CreateRetVoid();
@@ -5301,6 +5319,23 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) {
Constant::getNullValue(PointerType::get(Ctx, 0)),
Constant::getNullValue(PointerType::get(Ctx, 0))};
+ auto SimpleArgAccessorCB = [&](llvm::Argument &Arg, llvm::Value *Input,
+ IRBuilderBase &Builder) {
+ if (!OMPBuilder.Config.isTargetDevice())
+ return cast<llvm::Value>(&Arg);
+
+ llvm::Value *Addr = Builder.CreateAlloca(
+ Arg.getType()->isPointerTy() ? Arg.getType()
+ : Type::getInt64Ty(Builder.getContext()),
+ OMPBuilder.M.getDataLayout().getAllocaAddrSpace());
+ llvm::Value *AddrAscast =
+ Builder.CreatePointerBitCastOrAddrSpaceCast(Addr, Input->getType());
+ Builder.CreateStore(&Arg, AddrAscast);
+ llvm::Value *Copy = Builder.CreateLoad(Arg.getType(), AddrAscast);
+
+ return Copy;
+ };
+
llvm::OpenMPIRBuilder::MapInfosTy CombinedInfos;
auto GenMapInfoCB = [&](llvm::OpenMPIRBuilder::InsertPointTy codeGenIP)
-> llvm::OpenMPIRBuilder::MapInfosTy & {
@@ -5322,9 +5357,10 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) {
TargetRegionEntryInfo EntryInfo("parent", /*DeviceID=*/1, /*FileID=*/2,
/*Line=*/3, /*Count=*/0);
- Builder.restoreIP(OMPBuilder.createTarget(
- Loc, EntryIP, EntryIP, EntryInfo, /*NumTeams=*/-1,
- /*NumThreads=*/0, CapturedArgs, GenMapInfoCB, BodyGenCB));
+ Builder.restoreIP(
+ OMPBuilder.createTarget(Loc, EntryIP, EntryIP, EntryInfo, /*NumTeams=*/-1,
+ /*NumThreads=*/0, CapturedArgs, GenMapInfoCB,
+ BodyGenCB, SimpleArgAccessorCB));
Builder.CreateRetVoid();
OMPBuilder.finalize();
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 14bcbc3018f72bd..8326a20a15c4629 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1993,6 +1993,27 @@ handleDeclareTargetMapVar(llvm::ArrayRef<Value> mapOperands,
}
}
+static llvm::Value *
+createDeviceArgumentAccessor(llvm::Argument &arg, llvm::Value *input,
+ llvm::IRBuilderBase &builder,
+ llvm::OpenMPIRBuilder &ompBuilder,
+ LLVM::ModuleTranslation &moduleTranslation) {
+ if (!ompBuilder.Config.isTargetDevice())
+ return cast<llvm::Value>(&arg);
+
+ llvm::Value *addr =
+ builder.CreateAlloca(arg.getType()->isPointerTy()
+ ? arg.getType()
+ : llvm::Type::getInt64Ty(builder.getContext()),
+ ompBuilder.M.getDataLayout().getAllocaAddrSpace());
+ llvm::Value *addrAscast =
+ builder.CreatePointerBitCastOrAddrSpaceCast(addr, input->getType());
+ builder.CreateStore(&arg, addrAscast);
+ llvm::Value *copy = builder.CreateLoad(arg.getType(), addrAscast);
+
+ return copy;
+}
+
static LogicalResult
convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation) {
@@ -2084,9 +2105,26 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
return combinedInfos;
};
+ auto argAccessorCB = [&moduleTranslation](llvm::Argument &arg,
+ llvm::Value *input,
+ llvm::IRBuilderBase &builder) {
+ llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+
+ // We just return the unaltered argument for the host function
+ // for now, some alterations may be required in the future to
+ // keep host fallback functions working identically to the device
+ // version (e.g. pass ByCopy values should be treated as such on
+ // host and device, currently not always the case)
+ if (!ompBuilder->Config.isTargetDevice())
+ return cast<llvm::Value>(&arg);
+
+ return createDeviceArgumentAccessor(arg, input, builder, *ompBuilder,
+ moduleTranslation);
+ };
+
builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createTarget(
ompLoc, allocaIP, builder.saveIP(), entryInfo, defaultValTeams,
- defaultValThreads, inputs, genMapInfoCB, bodyCB));
+ defaultValThreads, inputs, genMapInfoCB, bodyCB, argAccessorCB));
// Remap access operations to declare target reference pointers for the
// device, essentially generating extra loadop's as necessary
|
@@ -2166,6 +2166,9 @@ class OpenMPIRBuilder { | |||
using TargetBodyGenCallbackTy = function_ref<InsertPointTy( | |||
InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>; | |||
|
|||
using TargetGenArgAccessorsCallbackTy = function_ref<Value *( | |||
Argument &Arg, Value *Input, IRBuilderBase &Builder)>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it necessary to pass the builder here
? I see that a local builder is already available in places this is defined.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, it appears to be necessary, the OpenMPIRBuilder has it's own LLVM builder that it utilises internally which will end up with a different InsertionPoint as the lowering of createTarget progresses, All we currently do is pass an initial insertion point to createTarget and then the internal OpenMPIRBuilder's LLVM builder takes this insertion point and progresses it and then returns the end insertion point back up to the original ModuleTranslation's LLVM builder so it can continue working from wherever the OpenMPIRBuilder left off! That's how I understand it at least. So using the local copy by reference will result in inserting the new instructions at the incorrect location as it won't ever actually be used by the OpenMPIRBuilder.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think passing around the builder is the coorect approach to solve this.
We should be passing and returning the InsertPoints instead. We should pass the AllocaIP and CodeGenIP to the Callback, and the CB should return the new position of the IP.
If new allocas are not needed the we can perhaps ignore the AllocaIP.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think the AllocaIP is currently used, the insertion point that's used at the moment is the first non-phi or debug node, which does result in slightly differing IR to Clang. But I'll utilise the insertion points pass in method and try to get it the IR closer to what Clang currently has.
My only worry of using two differing builders is if they have differing states, but I'm not sure if that is a concern, perhaps the builders only relevant state is the insertion points.
@@ -5301,6 +5319,23 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { | |||
Constant::getNullValue(PointerType::get(Ctx, 0)), | |||
Constant::getNullValue(PointerType::get(Ctx, 0))}; | |||
|
|||
auto SimpleArgAccessorCB = [&](llvm::Argument &Arg, llvm::Value *Input, | |||
IRBuilderBase &Builder) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should this callback also take an AllocaIP? Without that, we make it rely on the caller to make sure builder is set at the right IP.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's currently only ever called by createOutlinedFunction, which does pre-set the insertion point (and currently doesn't use the AllocaIP which results in slightly differing IR to Clang currently) and takes a lot of ownership of where that insertion point is at the moment, so this mainly just keeps the status quo that exists right now.
However, I am more than happy to allow it to take an IP or perhaps two, to make the IR from MLIR vs Clang look a little more identical and this would also allow me to utilise the ModuleTranslation builder as @TIFitis suggests
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think differing IRs is a problem as long as they are correct. I have not worked much on target side, so I am okay with the current implementation and would also be okay with adding an IP, whatever works best.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah it's not a big deal, just makes diff'ng things easier, and means there's less chance of breakage in the future if we stay consistent.
Sounds good, thank you for your input, and thank you as well @TIFitis
Updated to utilise the InsertPoint style interface for the lambda, hopefully it's a little more inline with what is expected for the CallbackInterfaces! Please let me know if it's not and if there's any other changes you'd like made. I also tried to generate output a little more like Clang, with the Allocas/Store preamble for the argument in the entry block and the loads in the user block. You can see what I mean a bit more tangibly with the altered tests. If we're not comfortable with that I can simply use the same InsertPoint for loads/allocas/stores as before. However, this change doesn't affect any runtime tests we have at the moment. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM 😄
Thank you very much for the review guys! I'll give @jsjodin sometime to review it if he wishes as he wrote it originally, and incase @jdoerfert wishes to give any input as well. Neither are required if you do not wish to spend the time reviewing though (as I'm aware everyone's time is precious and people are likely preparing to go to the conference next week), but I'll leave the patch open until Friday afternoon/evening (UK time) just incase! |
…on and re-order kernel argument load/stores This patch moves the existing copyInput function into a lambda argument that can be defined by a caller to the function. This allows more flexibility in how the function is defined, allowing Clang and MLIR to utilise their own respective functions and types inside of the lamba without affecting the OMPIRBuilder itself. The idea is to eventually replace/build on the existing copyInput function that's used and moved into OpenMPToLLVMIRTranslation.cpp to a slightly more complex implementation that uses MLIRs map information (primarily ByRef and ByCapture information at the moment). The patch also moves kernel load stores to the top of the kernel, prior to the first openmp runtime invocation. Just makes the IR a little closer to Clang.
e4ce1cb
to
8f88cb7
Compare
This patch moves the existing copyInput function
into a lambda argument that can be defined
by a caller to the function.
This allows more flexibility in how the function
is defined, allowing Clang and MLIR to utilise
their own respective functions and types inside
of the lamba without affecting the OMPIRBuilder
itself.
The idea is to eventually replace/build on
the existing copyInput function that's used
and moved into OpenMPToLLVMIRTranslation.cpp
to a slightly more complex implementation
that uses MLIRs map information (primarily
ByRef and ByCapture information at the
moment).
The patch also moves kernel load stores to the top
of the kernel, prior to the first openmp runtime
invocation. Just makes the IR a little closer to Clang.