Skip to content

[LoopInterchange] Improve profitability check for vectorization #133672

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 66 additions & 24 deletions llvm/lib/Transforms/Scalar/LoopInterchange.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Analysis/DependenceAnalysis.h"
Expand Down Expand Up @@ -126,7 +127,8 @@ static void printDepMatrix(CharMatrix &DepMatrix) {
}
#endif

static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
static bool populateDependencyMatrix(CharMatrix &DepMatrix,
BitVector &IsNegatedVec, unsigned Level,
Loop *L, DependenceInfo *DI,
ScalarEvolution *SE,
OptimizationRemarkEmitter *ORE) {
Expand Down Expand Up @@ -167,7 +169,9 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
return false;
}
ValueVector::iterator I, IE, J, JE;
StringSet<> Seen;

// Manage all found direction vectors, negated and not negated, separately.
StringSet<> Seen[2];

for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
for (J = I, JE = MemInstr.end(); J != JE; ++J) {
Expand All @@ -182,7 +186,8 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
assert(D->isOrdered() && "Expected an output, flow or anti dep.");
// If the direction vector is negative, normalize it to
// make it non-negative.
if (D->normalize(SE))
bool Normalized = D->normalize(SE);
if (Normalized)
LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
LLVM_DEBUG(StringRef DepType =
D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
Expand Down Expand Up @@ -214,8 +219,12 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
}

// Make sure we only add unique entries to the dependency matrix.
if (Seen.insert(StringRef(Dep.data(), Dep.size())).second)
// Negated vectors (due to normalization) are treated as separate from
// non negated ones.
if (Seen[Normalized].insert(StringRef(Dep.data(), Dep.size())).second) {
DepMatrix.push_back(Dep);
IsNegatedVec.push_back(Normalized);
}
}
}
}
Expand Down Expand Up @@ -399,7 +408,7 @@ class LoopInterchangeProfitability {
/// Check if the loop interchange is profitable.
bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
unsigned InnerLoopId, unsigned OuterLoopId,
CharMatrix &DepMatrix,
CharMatrix &DepMatrix, const BitVector &IsNegatedVec,
const DenseMap<const Loop *, unsigned> &CostMap,
std::unique_ptr<CacheCost> &CC);

Expand All @@ -409,9 +418,10 @@ class LoopInterchangeProfitability {
const DenseMap<const Loop *, unsigned> &CostMap,
std::unique_ptr<CacheCost> &CC);
std::optional<bool> isProfitablePerInstrOrderCost();
std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
unsigned OuterLoopId,
CharMatrix &DepMatrix);
std::optional<bool>
isProfitableForVectorization(unsigned InnerLoopId, unsigned OuterLoopId,
CharMatrix &DepMatrix,
const BitVector &IsNegatedVec);
Loop *OuterLoop;
Loop *InnerLoop;

Expand Down Expand Up @@ -503,8 +513,9 @@ struct LoopInterchange {
<< "\n");

CharMatrix DependencyMatrix;
BitVector IsNegatedVec;
Loop *OuterMostLoop = *(LoopList.begin());
if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
if (!populateDependencyMatrix(DependencyMatrix, IsNegatedVec, LoopNestDepth,
OuterMostLoop, DI, SE, ORE)) {
LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
return false;
Expand Down Expand Up @@ -543,8 +554,8 @@ struct LoopInterchange {
for (unsigned j = SelecLoopId; j > 0; j--) {
bool ChangedPerIter = false;
for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
bool Interchanged =
processLoop(LoopList, i, i - 1, DependencyMatrix, CostMap);
bool Interchanged = processLoop(LoopList, i, i - 1, DependencyMatrix,
IsNegatedVec, CostMap);
ChangedPerIter |= Interchanged;
Changed |= Interchanged;
}
Expand All @@ -559,6 +570,7 @@ struct LoopInterchange {
bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId,
unsigned OuterLoopId,
std::vector<std::vector<char>> &DependencyMatrix,
BitVector &IsNegatedVec,
const DenseMap<const Loop *, unsigned> &CostMap) {
Loop *OuterLoop = LoopList[OuterLoopId];
Loop *InnerLoop = LoopList[InnerLoopId];
Expand All @@ -572,7 +584,7 @@ struct LoopInterchange {
LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
DependencyMatrix, CostMap, CC)) {
DependencyMatrix, IsNegatedVec, CostMap, CC)) {
LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
return false;
}
Expand Down Expand Up @@ -1197,27 +1209,57 @@ LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
return std::nullopt;
}

static char flipDirection(char Dir) {
switch (Dir) {
case '<':
return '>';
case '>':
return '<';
case '=':
case 'I':
case '*':
return Dir;
default:
llvm_unreachable("Unknown direction");
}
}

/// Return true if we can vectorize the loop specified by \p LoopId.
static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
static bool canVectorize(const CharMatrix &DepMatrix,
const BitVector &IsNegatedVec, unsigned LoopId) {
// The loop can be vectorized if there are no negative dependencies. Consider
// the dependency of `j` in the following example.
//
// Positive: ... = A[i][j] Negative: ... = A[i][j-1]
// A[i][j-1] = ... A[i][j] = ...
//
// In the right case, vectorizing the loop can change the loaded value from
// `A[i][j-1]`. At the moment we don't take into account the distance of the
// dependency and vector width.
// TODO: Considering the dependency distance and the vector width can give a
// more accurate result. For example, the following loop can be vectorized if
// the vector width is less than or equal to 4 x sizeof(A[0][0]).
for (unsigned I = 0; I != DepMatrix.size(); I++) {
char Dir = DepMatrix[I][LoopId];
if (Dir != 'I' && Dir != '=')
if (IsNegatedVec[I])
Dir = flipDirection(Dir);
if (Dir != '=' && Dir != 'I' && Dir != '<')
return false;
}
return true;
}

std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
// If the outer loop is not loop independent it is not profitable to move
// this to inner position, since doing so would not enable inner loop
// parallelism.
if (!canVectorize(DepMatrix, OuterLoopId))
unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix,
const BitVector &IsNegatedVec) {
// If the outer loop cannot be vectorized, it is not profitable to move this
// to inner position.
if (!canVectorize(DepMatrix, IsNegatedVec, OuterLoopId))
return false;

// If inner loop has dependence and outer loop is loop independent then it is
// If inner loop cannot be vectorized and outer loop can be then it is
// profitable to interchange to enable inner loop parallelism.
if (!canVectorize(DepMatrix, InnerLoopId))
if (!canVectorize(DepMatrix, IsNegatedVec, InnerLoopId))
return true;

// If both the inner and the outer loop can be vectorized, it is necessary to
Expand All @@ -1230,7 +1272,7 @@ std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(

bool LoopInterchangeProfitability::isProfitable(
const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
unsigned OuterLoopId, CharMatrix &DepMatrix,
unsigned OuterLoopId, CharMatrix &DepMatrix, const BitVector &IsNegatedVec,
const DenseMap<const Loop *, unsigned> &CostMap,
std::unique_ptr<CacheCost> &CC) {
// isProfitable() is structured to avoid endless loop interchange. If the
Expand All @@ -1252,8 +1294,8 @@ bool LoopInterchangeProfitability::isProfitable(
shouldInterchange = isProfitablePerInstrOrderCost();
break;
case RuleTy::ForVectorization:
shouldInterchange =
isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
shouldInterchange = isProfitableForVectorization(InnerLoopId, OuterLoopId,
DepMatrix, IsNegatedVec);
break;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,15 +64,13 @@ exit:
; for (int j = 1; j < 256; j++)
; A[i][j-1] = A[i][j] + B[i][j];
;
; FIXME: These loops are exchanged at this time due to the problem in
; profitability heuristic calculation for vectorization.

; CHECK: --- !Passed
; CHECK: --- !Missed
; CHECK-NEXT: Pass: loop-interchange
; CHECK-NEXT: Name: Interchanged
; CHECK-NEXT: Name: InterchangeNotProfitable
; CHECK-NEXT: Function: interchange_unnecesasry_for_vectorization
; CHECK-NEXT: Args:
; CHECK-NEXT: - String: Loop interchanged with enclosing loop.
; CHECK-NEXT: - String: Insufficient information to calculate the cost of loop for interchange.
define void @interchange_unnecesasry_for_vectorization() {
entry:
br label %for.i.header
Expand Down Expand Up @@ -103,3 +101,59 @@ for.i.inc:
exit:
ret void
}

; Check that the below loops are exchanged to allow innermost loop
; vectorization. We cannot vectorize the j-loop because it has negative
; distance dependency, but the i-loop can be vectorized.
;
; for (int i = 0; i < 255; i++) {
; for (int j = 1; j < 256; j++) {
; A[i][j] = A[i][j-1] + B[i][j];
; C[i][j] += C[i+1][j];
; }
; }
;

; CHECK: --- !Passed
; CHECK-NEXT: Pass: loop-interchange
; CHECK-NEXT: Name: Interchanged
; CHECK-NEXT: Function: interchange_necessary_for_vectorization2
; CHECK-NEXT: Args:
; CHECK-NEXT: - String: Loop interchanged with enclosing loop.
define void @interchange_necessary_for_vectorization2() {
entry:
br label %for.i.header

for.i.header:
%i = phi i64 [ 1, %entry ], [ %i.next, %for.i.inc ]
%i.inc = add nsw i64 %i, 1
br label %for.j.body

for.j.body:
%j = phi i64 [ 1, %for.i.header ], [ %j.next, %for.j.body ]
%j.dec = add nsw i64 %j, -1
%a.load.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %i, i64 %j.dec
%b.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @B, i64 %i, i64 %j
%c.load.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @C, i64 %i.inc, i64 %j
%c.store.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @C, i64 %i, i64 %j
%a = load float, ptr %a.load.index, align 4
%b = load float, ptr %b.index, align 4
%c0 = load float, ptr %c.load.index, align 4
%c1 = load float, ptr %c.store.index, align 4
%add.0 = fadd float %a, %b
%a.store.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %i, i64 %j
store float %add.0, ptr %a.store.index, align 4
%add.1 = fadd float %c0, %c1
store float %add.1, ptr %c.store.index, align 4
%j.next = add nuw nsw i64 %j, 1
%cmp.j = icmp eq i64 %j.next, 256
br i1 %cmp.j, label %for.i.inc, label %for.j.body

for.i.inc:
%i.next = add nuw nsw i64 %i, 1
%cmp.i = icmp eq i64 %i.next, 255
br i1 %cmp.i, label %exit, label %for.i.header

exit:
ret void
}