Skip to content
Merged
114 changes: 100 additions & 14 deletions llvm/lib/Transforms/Scalar/LoopInterchange.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Analysis/LoopCacheAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
Expand Down Expand Up @@ -70,6 +70,13 @@ namespace {

using LoopVector = SmallVector<Loop *, 8>;

/// A list of direction vectors. Each entry represents a direction vector
/// corresponding to one or more dependencies existing in the loop nest. The
/// length of all direction vectors is equal and is N + 1, where N is the depth
/// of the loop nest. The first N elements correspond to the dependency
/// direction of each N loops. The last one indicates whether this entry is
/// forward dependency ('<') or not ('*'). The term "forward" aligns with what
/// is defined in LoopAccessAnalysis.
// TODO: Check if we can use a sparse matrix here.
using CharMatrix = std::vector<std::vector<char>>;

Expand Down Expand Up @@ -126,11 +133,33 @@ static bool noDuplicateRulesAndIgnore(ArrayRef<RuleTy> Rules) {

static void printDepMatrix(CharMatrix &DepMatrix) {
for (auto &Row : DepMatrix) {
for (auto D : Row)
// Drop the last element because it is a flag indicating whether this is
// forward dependency or not, which doesn't affect the legality check.
for (char D : drop_end(Row))
LLVM_DEBUG(dbgs() << D << " ");
LLVM_DEBUG(dbgs() << "\n");
}
}

/// Return true if \p Src appears before \p Dst in the same basic block.
/// Precondition: \p Src and \Dst are distinct instructions within the same
/// basic block.
static bool inThisOrder(const Instruction *Src, const Instruction *Dst) {
assert(Src->getParent() == Dst->getParent() && Src != Dst &&
"Expected Src and Dst to be different instructions in the same BB");

bool FoundSrc = false;
for (const Instruction &I : *(Src->getParent())) {
if (&I == Src) {
FoundSrc = true;
continue;
}
if (&I == Dst)
return FoundSrc;
}

llvm_unreachable("Dst not found");
}
#endif

static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
Expand Down Expand Up @@ -174,7 +203,10 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
return false;
}
ValueVector::iterator I, IE, J, JE;
StringSet<> Seen;

// Manage direction vectors that are already seen. Map each direction vector
// to an index of DepMatrix at which it is stored.
StringMap<unsigned> Seen;

for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
for (J = I, JE = MemInstr.end(); J != JE; ++J) {
Expand Down Expand Up @@ -228,9 +260,49 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
Dep.push_back('I');
}

// Test whether the dependency is forward or not.
bool IsKnownForward = true;
if (Src->getParent() != Dst->getParent()) {
// In general, when Src and Dst are in different BBs, the execution
// order of them within a single iteration is not guaranteed. Treat
// conservatively as not-forward dependency in this case.
IsKnownForward = false;
} else {
// Src and Dst are in the same BB. If they are the different
// instructions, Src should appear before Dst in the BB as they are
// stored to MemInstr in that order.
assert((Src == Dst || inThisOrder(Src, Dst)) &&
"Unexpected instructions");

// If the Dependence object is reversed (due to normalization), it
// represents the dependency from Dst to Src, meaning it is a backward
// dependency. Otherwise it should be a forward dependency.
bool IsReversed = D->getSrc() != Src;
if (IsReversed)
IsKnownForward = false;
}

// Initialize the last element. Assume forward dependencies only; it
// will be updated later if there is any non-forward dependency.
Dep.push_back('<');

// The last element should express the "summary" among one or more
// direction vectors whose first N elements are the same (where N is
// the depth of the loop nest). Hence we exclude the last element from
// the Seen map.
auto [Ite, Inserted] = Seen.try_emplace(
StringRef(Dep.data(), Dep.size() - 1), DepMatrix.size());

// Make sure we only add unique entries to the dependency matrix.
if (Seen.insert(StringRef(Dep.data(), Dep.size())).second)
if (Inserted)
DepMatrix.push_back(Dep);

// If we cannot prove that this dependency is forward, change the last
// element of the corresponding entry. Since a `[... *]` dependency
// includes a `[... <]` dependency, we do not need to keep both and
// change the existing entry instead.
if (!IsKnownForward)
DepMatrix[Ite->second].back() = '*';
}
}
}
Expand Down Expand Up @@ -281,11 +353,12 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
continue;

// Check if the direction vector is lexicographically positive (or zero)
// for both before/after exchanged.
if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false)
// for both before/after exchanged. Ignore the last element because it
// doesn't affect the legality.
if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
return false;
std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);
if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false)
if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
return false;
}
return true;
Expand Down Expand Up @@ -1334,22 +1407,35 @@ LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
for (const auto &Dep : DepMatrix) {
char Dir = Dep[LoopId];
if (Dir != 'I' && Dir != '=')
return false;
char DepType = Dep.back();
assert((DepType == '<' || DepType == '*') &&
"Unexpected element in dependency vector");

// There are no loop-carried dependencies.
if (Dir == '=' || Dir == 'I')
continue;

// DepType being '<' means that this direction vector represents a forward
// dependency. In principle, a loop with '<' direction can be vectorized in
// this case.
if (Dir == '<' && DepType == '<')
continue;

// We cannot prove that the loop is vectorizable.
return false;
}
return true;
}

std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
// If the outer loop is not loop independent it is not profitable to move
// this to inner position, since doing so would not enable inner loop
// parallelism.
// If the outer loop cannot be vectorized, it is not profitable to move this
// to inner position.
if (!canVectorize(DepMatrix, OuterLoopId))
return false;

// If inner loop has dependence and outer loop is loop independent then it is
// profitable to interchange to enable inner loop parallelism.
// If the inner loop cannot be vectorized but the outer loop can be, then it
// is profitable to interchange to enable inner loop parallelism.
if (!canVectorize(DepMatrix, InnerLoopId))
return true;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
@A = dso_local global [256 x [256 x float]] zeroinitializer
@B = dso_local global [256 x [256 x float]] zeroinitializer
@C = dso_local global [256 x [256 x float]] zeroinitializer
@D = global [256 x [256 x [256 x float]]] zeroinitializer
@E = global [256 x [256 x [256 x float]]] zeroinitializer

; Check that the below loops are exchanged for vectorization.
;
Expand Down Expand Up @@ -64,15 +66,13 @@ exit:
; for (int j = 1; j < 256; j++)
; A[i][j-1] = A[i][j] + B[i][j];
;
; FIXME: These loops are exchanged at this time due to the problem in
; profitability heuristic calculation for vectorization.

; CHECK: --- !Passed
; CHECK: --- !Missed
; CHECK-NEXT: Pass: loop-interchange
; CHECK-NEXT: Name: Interchanged
; CHECK-NEXT: Name: InterchangeNotProfitable
; CHECK-NEXT: Function: interchange_unnecesasry_for_vectorization
; CHECK-NEXT: Args:
; CHECK-NEXT: - String: Loop interchanged with enclosing loop.
; CHECK-NEXT: - String: Insufficient information to calculate the cost of loop for interchange.
define void @interchange_unnecesasry_for_vectorization() {
entry:
br label %for.i.header
Expand Down Expand Up @@ -103,3 +103,134 @@ for.i.inc:
exit:
ret void
}

; Check that the below loops are exchanged to allow innermost loop
; vectorization. We cannot vectorize the j-loop because it has a lexically
; backward dependency, but the i-loop can be vectorized because all the
; loop-carried dependencies are lexically forward. LoopVectorize currently only
; vectorizes innermost loop, hence move the i-loop to that position.
;
; for (int i = 0; i < 255; i++) {
; for (int j = 1; j < 256; j++) {
; A[i][j] = A[i][j-1] + B[i][j];
; C[i][j] += C[i+1][j];
; }
; }
;

; CHECK: --- !Passed
; CHECK-NEXT: Pass: loop-interchange
; CHECK-NEXT: Name: Interchanged
; CHECK-NEXT: Function: interchange_necessary_for_vectorization2
; CHECK-NEXT: Args:
; CHECK-NEXT: - String: Loop interchanged with enclosing loop.
define void @interchange_necessary_for_vectorization2() {
entry:
br label %for.i.header

for.i.header:
%i = phi i64 [ 1, %entry ], [ %i.next, %for.i.inc ]
%i.inc = add i64 %i, 1
br label %for.j.body

for.j.body:
%j = phi i64 [ 1, %for.i.header ], [ %j.next, %for.j.body ]
%j.dec = add i64 %j, -1
%a.load.index = getelementptr [256 x [256 x float]], ptr @A, i64 0, i64 %i, i64 %j.dec
%b.index = getelementptr [256 x [256 x float]], ptr @B, i64 0, i64 %i, i64 %j
%c.load.index = getelementptr [256 x [256 x float]], ptr @C, i64 0, i64 %i.inc, i64 %j
%c.store.index = getelementptr [256 x [256 x float]], ptr @C, i64 0, i64 %i, i64 %j
%a = load float, ptr %a.load.index
%b = load float, ptr %b.index
%c0 = load float, ptr %c.load.index
%c1 = load float, ptr %c.store.index
%add.0 = fadd float %a, %b
%a.store.index = getelementptr [256 x [256 x float]], ptr @A, i64 0, i64 %i, i64 %j
store float %add.0, ptr %a.store.index
%add.1 = fadd float %c0, %c1
store float %add.1, ptr %c.store.index
%j.next = add i64 %j, 1
%cmp.j = icmp eq i64 %j.next, 256
br i1 %cmp.j, label %for.i.inc, label %for.j.body

for.i.inc:
%i.next = add i64 %i, 1
%cmp.i = icmp eq i64 %i.next, 255
br i1 %cmp.i, label %exit, label %for.i.header

exit:
ret void
}

; Check that no interchange is performed for the following loop. Interchanging
; the j-loop and k-loop makes the innermost loop vectorizble, since the j-loop
; has only forward dependencies. However, at the moment, a loop body consisting
; of multiple BBs is handled pesimistically. Hence the j-loop isn't moved to
; the innermost place.
;
; for (int i = 0; i < 255; i++) {
; for (int j = 0; j < 255; j++) {
; for (int k = 0; k < 128; k++) {
; E[i][j][k] = D[i+1][j+1][2*k];
; if (cond)
; D[i][j][k+1] = 1.0;
; }
; }

; CHECK: --- !Missed
; CHECK-NEXT: Pass: loop-interchange
; CHECK-NEXT: Name: InterchangeNotProfitable
; CHECK-NEXT: Function: multiple_BBs_in_loop
; CHECK-NEXT: Args:
; CHECK-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization.
; CHECK: --- !Missed
; CHECK-NEXT: Pass: loop-interchange
; CHECK-NEXT: Name: InterchangeNotProfitable
; CHECK-NEXT: Function: multiple_BBs_in_loop
; CHECK-NEXT: Args:
; CHECK-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization.
define void @multiple_BBs_in_loop() {
entry:
br label %for.i.header

for.i.header:
%i = phi i64 [ 0, %entry ], [ %i.inc, %for.i.inc ]
%i.inc = add i64 %i, 1
br label %for.j.header

for.j.header:
%j = phi i64 [ 0, %for.i.header ], [ %j.inc, %for.j.inc ]
%j.inc = add i64 %j, 1
br label %for.k.body

for.k.body:
%k = phi i64 [ 0, %for.j.header ], [ %k.inc, %for.k.inc ]
%k.inc = add i64 %k, 1
%k.2 = mul i64 %k, 2
%d.index = getelementptr [256 x [256 x [256 x float]]], ptr @D, i64 0, i64 %i.inc, i64 %j.inc, i64 %k.2
%e.index = getelementptr [256 x [256 x [256 x float]]], ptr @E, i64 0, i64 %i, i64 %j, i64 %k
%d.load = load float, ptr %d.index
store float %d.load, ptr %e.index
%cond = freeze i1 undef
br i1 %cond, label %if.then, label %for.k.inc

if.then:
%d.index2 = getelementptr [256 x [256 x [256 x float]]], ptr @D, i64 0, i64 %i, i64 %j, i64 %k.inc
store float 1.0, ptr %d.index2
br label %for.k.inc

for.k.inc:
%cmp.k = icmp eq i64 %k.inc, 128
br i1 %cmp.k, label %for.j.inc, label %for.k.body

for.j.inc:
%cmp.j = icmp eq i64 %j.inc, 255
br i1 %cmp.j, label %for.i.inc, label %for.j.header

for.i.inc:
%cmp.i = icmp eq i64 %i.inc, 255
br i1 %cmp.i, label %exit, label %for.i.header

exit:
ret void
}