Skip to content

[flang][OpenMP] Map simple do concurrent loops to OpenMP host constructs #127633

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions flang/docs/DoConcurrentConversionToOpenMP.md
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,53 @@ see the "Data environment" section below.
See `flang/test/Transforms/DoConcurrent/loop_nest_test.f90` for more examples
of what is and is not detected as a perfect loop nest.

### Single-range loops

Given the following loop:
```fortran
do concurrent(i=1:n)
a(i) = i * i
end do
```

#### Mapping to `host`

Mapping this loop to the `host` generates MLIR operations with the following
structure:

```
%4 = fir.address_of(@_QFEa) ...
%6:2 = hlfir.declare %4 ...

omp.parallel {
// Allocate private copy for `i`.
// TODO Use delayed privatization.
%19 = fir.alloca i32 {bindc_name = "i"}
%20:2 = hlfir.declare %19 {uniq_name = "_QFEi"} ...

omp.wsloop {
omp.loop_nest (%arg0) : index = (%21) to (%22) inclusive step (%c1_2) {
%23 = fir.convert %arg0 : (index) -> i32
// Use the privatized version of `i`.
fir.store %23 to %20#1 : !fir.ref<i32>
...

// Use "shared" SSA value of `a`.
%42 = hlfir.designate %6#0
hlfir.assign %35 to %42
...
omp.yield
}
omp.terminator
}
omp.terminator
}
```

#### Mapping to `device`

<!-- TODO -->

<!--
More details about current status will be added along with relevant parts of the
implementation in later upstreaming patches.
Expand Down
202 changes: 190 additions & 12 deletions flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "flang/Optimizer/OpenMP/Utils.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/RegionUtils.h"

Expand All @@ -24,7 +25,67 @@ namespace flangomp {

namespace {
namespace looputils {
using LoopNest = llvm::SetVector<fir::DoLoopOp>;
/// Stores info needed about the induction/iteration variable for each `do
/// concurrent` in a loop nest.
struct InductionVariableInfo {
/// The operation allocating memory for the iteration variable; i.e. the
/// defining op of the memref the loop body stores the induction variable
/// into (found by `findLoopIterationVarMemDecl`).
mlir::Operation *iterVarMemDef;
};

/// Maps each loop in a nest to its induction-variable info. `llvm::MapVector`
/// preserves insertion order, so iteration visits loops outermost first and
/// `.back()` is the innermost loop.
using LoopNestToIndVarMap =
llvm::MapVector<fir::DoLoopOp, InductionVariableInfo>;

/// For the \p doLoop parameter, find the operation that declares its iteration
/// variable or allocates memory for it.
///
/// For example, given the following loop:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
/// For example, give the following loop:
/// For example, given the following loop:

/// ```
/// ...
/// %i:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : ...
/// ...
/// fir.do_loop %ind_var = %lb to %ub step %s unordered {
/// %ind_var_conv = fir.convert %ind_var : (index) -> i32
/// fir.store %ind_var_conv to %i#1 : !fir.ref<i32>
/// ...
/// }
/// ```
///
/// This function returns the `hlfir.declare` op for `%i`.
///
/// Note: The current implementation depends on how flang emits loop
/// bodies, which is sufficient for the current simple test/use cases. If this
/// proves to be insufficient, this should be made more generic.
mlir::Operation *findLoopIterationVarMemDecl(fir::DoLoopOp doLoop) {
  mlir::Value result = nullptr;

  // Decides whether a StoreOp is updating the memref of the loop's iteration
  // variable.
  auto isStoringIV = [&](fir::StoreOp storeOp) {
    // Direct store of the induction variable into the IV memref.
    if (storeOp.getValue() == doLoop.getInductionVar())
      return true;

    // Indirect store into the IV memref: the induction variable is first
    // converted (e.g. index -> i32) and the conversion's result is stored.
    // Use the null-safe `getDefiningOp<OpTy>()` so that stored values that
    // are block arguments (which have no defining op) do not cause a null
    // dereference inside `dyn_cast`.
    if (auto convertOp = storeOp.getValue().getDefiningOp<fir::ConvertOp>())
      return convertOp.getOperand() == doLoop.getInductionVar();

    return false;
  };

  // Walk the loop body and take the memref of the first store that targets
  // the induction variable.
  for (mlir::Operation &op : doLoop)
    if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(op))
      if (isStoringIV(storeOp)) {
        result = storeOp.getMemref();
        break;
      }

  assert(result != nullptr && result.getDefiningOp() != nullptr);
  return result.getDefiningOp();
}

/// Loop \p innerLoop is considered perfectly-nested inside \p outerLoop iff
/// there are no operations in \p outerloop's body other than:
Expand Down Expand Up @@ -116,11 +177,14 @@ bool isPerfectlyNested(fir::DoLoopOp outerLoop, fir::DoLoopOp innerLoop) {
/// fails to recognize a certain nested loop as part of the nest, it just
/// returns the parent loops it discovered before.
mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop,
LoopNest &loopNest) {
LoopNestToIndVarMap &loopNest) {
assert(currentLoop.getUnordered());

while (true) {
loopNest.insert(currentLoop);
loopNest.insert(
{currentLoop,
InductionVariableInfo{findLoopIterationVarMemDecl(currentLoop)}});

llvm::SmallVector<fir::DoLoopOp> unorderedLoops;

for (auto nestedLoop : currentLoop.getRegion().getOps<fir::DoLoopOp>())
Expand Down Expand Up @@ -152,26 +216,140 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
public:
using mlir::OpConversionPattern<fir::DoLoopOp>::OpConversionPattern;

/// \param context   MLIR context to register the pattern with.
/// \param mapToDevice when true, `do concurrent` loops are mapped to OpenMP
///        device constructs (currently rejected with a not-yet-implemented
///        error); otherwise they are mapped to host constructs.
/// \param concurrentLoopsToSkip set shared with the driving pass; loops
///        inserted here are treated as legal by the `ConversionTarget` and
///        therefore left untouched.
DoConcurrentConversion(mlir::MLIRContext *context, bool mapToDevice,
                       llvm::DenseSet<fir::DoLoopOp> &concurrentLoopsToSkip)
    : OpConversionPattern(context), mapToDevice(mapToDevice),
      concurrentLoopsToSkip(concurrentLoopsToSkip) {}

/// Converts a `do concurrent` loop nest rooted at \p doLoop into
/// `omp.parallel` + `omp.wsloop` + `omp.loop_nest`, privatizing each loop's
/// induction variable inside the parallel region.
mlir::LogicalResult
matchAndRewrite(fir::DoLoopOp doLoop, OpAdaptor adaptor,
                mlir::ConversionPatternRewriter &rewriter) const override {
  // Device mapping is not upstreamed yet; fail loudly instead of silently
  // emitting host constructs.
  if (mapToDevice)
    return doLoop.emitError(
        "not yet implemented: Mapping `do concurrent` loops to device");

  // Collect the perfectly-nested loops rooted at `doLoop` together with each
  // loop's induction-variable info.
  looputils::LoopNestToIndVarMap loopNest;
  bool hasRemainingNestedLoops =
      failed(looputils::collectLoopNest(doLoop, loopNest));
  if (hasRemainingNestedLoops)
    mlir::emitWarning(doLoop.getLoc(),
                      "Some `do concurrent` loops are not perfectly-nested. "
                      "These will be serialized.");

  mlir::IRMapping mapper;
  genParallelOp(doLoop.getLoc(), rewriter, loopNest, mapper);
  mlir::omp::LoopNestOperands loopNestClauseOps;
  genLoopNestClauseOps(doLoop.getLoc(), rewriter, loopNest, mapper,
                       loopNestClauseOps);

  // The innermost loop (`loopNest.back()`) supplies the body that is cloned
  // into the generated `omp.loop_nest`.
  mlir::omp::LoopNestOp ompLoopNest =
      genWsLoopOp(rewriter, loopNest.back().first, mapper, loopNestClauseOps,
                  /*isComposite=*/mapToDevice);

  rewriter.eraseOp(doLoop);

  // Mark `unordered` loops that are not perfectly nested to be skipped from
  // the legality check of the `ConversionTarget` since we are not interested
  // in mapping them to OpenMP.
  ompLoopNest->walk([&](fir::DoLoopOp nestedLoop) {
    if (nestedLoop.getUnordered())
      concurrentLoopsToSkip.insert(nestedLoop);
  });

  return mlir::success();
}

private:
/// Creates an `omp.parallel` op with an empty entry block terminated by
/// `omp.terminator`, emits the privatized induction-variable allocations
/// inside it, and leaves the rewriter's insertion point just before the
/// terminator so callers can keep emitting into the region.
mlir::omp::ParallelOp genParallelOp(mlir::Location loc,
                                    mlir::ConversionPatternRewriter &rewriter,
                                    looputils::LoopNestToIndVarMap &loopNest,
                                    mlir::IRMapping &mapper) const {
  auto parallelOp = rewriter.create<mlir::omp::ParallelOp>(loc);
  rewriter.createBlock(&parallelOp.getRegion());

  // Terminate the region first, then position the insertion point right
  // before the terminator for everything that follows.
  auto terminator = rewriter.create<mlir::omp::TerminatorOp>(loc);
  rewriter.setInsertionPoint(terminator);

  genLoopNestIndVarAllocs(rewriter, loopNest, mapper);
  return parallelOp;
}

/// Emits a privatized allocation (clone of the original declare/alloc ops)
/// for the induction variable of every loop in \p loopNest, in nest order.
void genLoopNestIndVarAllocs(mlir::ConversionPatternRewriter &rewriter,
                             looputils::LoopNestToIndVarMap &loopNest,
                             mlir::IRMapping &mapper) const {
  for (auto &entry : loopNest)
    genInductionVariableAlloc(rewriter, entry.second.iterVarMemDef, mapper);
}

/// Clones, at the current insertion point, the op defining an induction
/// variable's memref (\p indVarMemDef) together with the ops producing its
/// operands (e.g. the `fir.alloca` feeding an `hlfir.declare`), recording the
/// clones in \p mapper.
///
/// \returns the last cloned operation, i.e. the clone of \p indVarMemDef.
mlir::Operation *
genInductionVariableAlloc(mlir::ConversionPatternRewriter &rewriter,
                          mlir::Operation *indVarMemDef,
                          mlir::IRMapping &mapper) const {
  assert(
      indVarMemDef != nullptr &&
      "Induction variable memdef is expected to have a defining operation.");

  // Collect the operand-producing ops first, then the memdef itself, so the
  // clones are emitted in def-before-use order. The SetVector de-duplicates.
  llvm::SmallSetVector<mlir::Operation *, 2> indVarDeclareAndAlloc;
  for (mlir::Value operand : indVarMemDef->getOperands())
    // Block arguments have no defining op; there is nothing to clone for
    // them (and inserting/cloning a null pointer would crash).
    if (mlir::Operation *operandDef = operand.getDefiningOp())
      indVarDeclareAndAlloc.insert(operandDef);
  indVarDeclareAndAlloc.insert(indVarMemDef);

  mlir::Operation *result = nullptr;
  for (mlir::Operation *opToClone : indVarDeclareAndAlloc)
    result = rewriter.clone(*opToClone, mapper);

  return result;
}

/// Populates \p loopNestClauseOps with the lower/upper bounds and steps of
/// every loop in \p loopNest (outermost first) and marks the bounds
/// inclusive, matching `fir.do_loop` semantics.
void genLoopNestClauseOps(
    mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
    looputils::LoopNestToIndVarMap &loopNest, mlir::IRMapping &mapper,
    mlir::omp::LoopNestOperands &loopNestClauseOps) const {
  assert(loopNestClauseOps.loopLowerBounds.empty() &&
         "Loop nest bounds were already emitted!");

  // Use the bound value itself. Going through
  // `var.getDefiningOp()->getResult(0)` crashes when the bound is a block
  // argument (no defining op) and silently picks the wrong value when the
  // defining op has multiple results; `var` is always exactly the SSA value
  // the `fir.do_loop` uses.
  auto populateBounds = [](mlir::Value var,
                           llvm::SmallVectorImpl<mlir::Value> &bounds) {
    bounds.push_back(var);
  };

  for (auto &[doLoop, _] : loopNest) {
    populateBounds(doLoop.getLowerBound(), loopNestClauseOps.loopLowerBounds);
    populateBounds(doLoop.getUpperBound(), loopNestClauseOps.loopUpperBounds);
    populateBounds(doLoop.getStep(), loopNestClauseOps.loopSteps);
  }

  // `do concurrent` iterates up to and including the upper bound.
  loopNestClauseOps.loopInclusive = rewriter.getUnitAttr();
}

/// Builds an `omp.wsloop` wrapping an `omp.loop_nest` at the current
/// insertion point, cloning \p doLoop's body into the loop nest through
/// \p mapper and replacing the cloned terminator with `omp.yield`.
///
/// \returns the generated `omp.loop_nest` op.
mlir::omp::LoopNestOp
genWsLoopOp(mlir::ConversionPatternRewriter &rewriter, fir::DoLoopOp doLoop,
            mlir::IRMapping &mapper,
            const mlir::omp::LoopNestOperands &clauseOps,
            bool isComposite) const {
  mlir::Location loc = doLoop.getLoc();

  // The worksharing wrapper; its single block hosts the loop nest.
  auto wsloopOp = rewriter.create<mlir::omp::WsloopOp>(loc);
  wsloopOp.setComposite(isComposite);
  rewriter.createBlock(&wsloopOp.getRegion());

  auto loopNestOp = rewriter.create<mlir::omp::LoopNestOp>(loc, clauseOps);
  mlir::Region &nestRegion = loopNestOp.getRegion();

  // Clone the loop's body inside the loop nest construct using the mapped
  // values.
  rewriter.cloneRegionBefore(doLoop.getRegion(), nestRegion,
                             nestRegion.begin(), mapper);

  // The cloned region still ends with the original loop's terminator; swap
  // it for the `omp.yield` that `omp.loop_nest` requires.
  mlir::Operation *clonedTerminator = nestRegion.back().getTerminator();
  rewriter.setInsertionPointToEnd(&nestRegion.back());
  rewriter.create<mlir::omp::YieldOp>(clonedTerminator->getLoc());
  rewriter.eraseOp(clonedTerminator);

  return loopNestOp;
}

bool mapToDevice;
llvm::DenseSet<fir::DoLoopOp> &concurrentLoopsToSkip;
};

class DoConcurrentConversionPass
Expand Down Expand Up @@ -200,24 +378,24 @@ class DoConcurrentConversionPass
return;
}

llvm::DenseSet<fir::DoLoopOp> concurrentLoopsToSkip;
mlir::RewritePatternSet patterns(context);
patterns.insert<DoConcurrentConversion>(
context, mapTo == flangomp::DoConcurrentMappingKind::DCMK_Device);
context, mapTo == flangomp::DoConcurrentMappingKind::DCMK_Device,
concurrentLoopsToSkip);
mlir::ConversionTarget target(*context);
target.addDynamicallyLegalOp<fir::DoLoopOp>([&](fir::DoLoopOp op) {
// The goal is to handle constructs that eventually get lowered to
// `fir.do_loop` with the `unordered` attribute (e.g. array expressions).
// Currently, this is only enabled for the `do concurrent` construct since
// the pass runs early in the pipeline.
return !op.getUnordered();
return !op.getUnordered() || concurrentLoopsToSkip.contains(op);
});
target.markUnknownOpDynamicallyLegal(
[](mlir::Operation *) { return true; });

if (mlir::failed(mlir::applyFullConversion(getOperation(), target,
std::move(patterns)))) {
mlir::emitError(mlir::UnknownLoc::get(context),
"error in converting do-concurrent op");
signalPassFailure();
}
}
Expand Down
29 changes: 29 additions & 0 deletions flang/test/Transforms/DoConcurrent/basic_device.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
// RUN: fir-opt --omp-do-concurrent-conversion="map-to=device" -verify-diagnostics %s

// Verifies that attempting to map a `do concurrent` loop to the device is
// rejected: the conversion pattern emits a "not yet implemented" error and
// the unordered `fir.do_loop` consequently fails to legalize.
func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_basic"} {
%0 = fir.alloca i32 {bindc_name = "i"}
%1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
%2 = fir.address_of(@_QFEa) : !fir.ref<!fir.array<10xi32>>
%c10 = arith.constant 10 : index
%3 = fir.shape %c10 : (index) -> !fir.shape<1>
%4:2 = hlfir.declare %2(%3) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
%c1_i32 = arith.constant 1 : i32
%7 = fir.convert %c1_i32 : (i32) -> index
%c10_i32 = arith.constant 10 : i32
%8 = fir.convert %c10_i32 : (i32) -> index
%c1 = arith.constant 1 : index

// expected-error@+2 {{not yet implemented: Mapping `do concurrent` loops to device}}
// expected-error@below {{failed to legalize operation 'fir.do_loop'}}
fir.do_loop %arg0 = %7 to %8 step %c1 unordered {
%13 = fir.convert %arg0 : (index) -> i32
fir.store %13 to %1#1 : !fir.ref<i32>
%14 = fir.load %1#0 : !fir.ref<i32>
%15 = fir.load %1#0 : !fir.ref<i32>
%16 = fir.convert %15 : (i32) -> i64
%17 = hlfir.designate %4#0 (%16) : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
hlfir.assign %14 to %17 : i32, !fir.ref<i32>
}

return
}
16 changes: 6 additions & 10 deletions flang/test/Transforms/DoConcurrent/basic_host.f90
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@
! Mark as xfail for now until we upstream the relevant part. This is just for
! demo purposes at this point. Upstreaming this is the next step.
! XFAIL: *

! Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`.

! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \
Expand All @@ -19,21 +15,21 @@ program do_concurrent_basic

! CHECK-NOT: fir.do_loop

! CHECK: omp.parallel {

! CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"}
! CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)

! CHECK: %[[C1:.*]] = arith.constant 1 : i32
! CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index
! CHECK: %[[C10:.*]] = arith.constant 10 : i32
! CHECK: %[[UB:.*]] = fir.convert %[[C10]] : (i32) -> index
! CHECK: %[[STEP:.*]] = arith.constant 1 : index

! CHECK: omp.parallel {

! CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"}
! CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)

! CHECK: omp.wsloop {
! CHECK-NEXT: omp.loop_nest (%[[ARG0:.*]]) : index = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
! CHECK-NEXT: %[[IV_IDX:.*]] = fir.convert %[[ARG0]] : (index) -> i32
! CHECK-NEXT: fir.store %[[IV_IDX]] to %[[BINDING]]#1 : !fir.ref<i32>
! CHECK-NEXT: fir.store %[[IV_IDX]] to %[[BINDING]]#0 : !fir.ref<i32>
! CHECK-NEXT: %[[IV_VAL1:.*]] = fir.load %[[BINDING]]#0 : !fir.ref<i32>
! CHECK-NEXT: %[[IV_VAL2:.*]] = fir.load %[[BINDING]]#0 : !fir.ref<i32>
! CHECK-NEXT: %[[IV_VAL_I64:.*]] = fir.convert %[[IV_VAL2]] : (i32) -> i64
Expand Down
Loading