From b1c4248c4a242698feedb6d69d61d07dbcca407c Mon Sep 17 00:00:00 2001 From: Ryotaro Kasuga Date: Thu, 27 Mar 2025 10:45:26 +0000 Subject: [PATCH 1/3] [LoopInterchange] Improve profitability check for vectorization The vectorization profitability heuristic checks whether a given loop can be vectorized or not. Because this check is conservative, a loop that can in fact be vectorized may be judged non-vectorizable, which can trigger unnecessary interchanges. This patch improves the profitability decision by mitigating such misjudgments. Before this patch, we considered a loop to be vectorizable only when there are no loop-carried dependencies with the IV of the loop. However, a loop-carried dependency doesn't prevent vectorization if the distance is positive. This patch makes the vectorization check more accurate by also treating a loop as vectorizable when its loop-carried dependencies are all positive. Note that it is difficult to decide precisely whether a loop can be vectorized or not. Doing so would require checking the vector width and the dependency distance. --- .../lib/Transforms/Scalar/LoopInterchange.cpp | 128 ++++++++++++++---- .../profitability-vectorization-heuristic.ll | 8 +- 2 files changed, 106 insertions(+), 30 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 1dccba4cfa7b8..078da53c52b52 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -17,8 +17,8 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" -#include "llvm/ADT/StringSet.h" #include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/LoopCacheAnalysis.h" #include "llvm/Analysis/LoopInfo.h" @@ -80,6 +80,21 @@ enum class RuleTy { ForVectorization, }; +/// Store the information about if corresponding direction vector was negated +/// by normalization or not. 
This is necessary to restore the original one from +/// a row of a dependency matrix, because we only manage normalized direction +/// vectors and duplicate vectors are eliminated. So there may be both original +/// and negated vectors for a single entry (a row of dependency matrix). E.g., +/// if there are two direction vectors `[< =]` and `[> =]`, the later one will +/// be converted to the same as former one by normalization, so only `[< =]` +/// would be retained in the final result. +struct NegatedStatus { + bool Original = false; + bool Negated = false; + + bool isNonNegativeDir(char Dir) const; +}; + } // end anonymous namespace // Minimum loop depth supported. @@ -126,9 +141,10 @@ static void printDepMatrix(CharMatrix &DepMatrix) { } #endif -static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, - Loop *L, DependenceInfo *DI, - ScalarEvolution *SE, +static bool populateDependencyMatrix(CharMatrix &DepMatrix, + std::vector &NegStatusVec, + unsigned Level, Loop *L, + DependenceInfo *DI, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE) { using ValueVector = SmallVector; @@ -167,7 +183,9 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, return false; } ValueVector::iterator I, IE, J, JE; - StringSet<> Seen; + + // Manage all found direction vectors. and map it to the index of DepMatrix. + StringMap Seen; for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) { for (J = I, JE = MemInstr.end(); J != JE; ++J) { @@ -182,7 +200,8 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, assert(D->isOrdered() && "Expected an output, flow or anti dep."); // If the direction vector is negative, normalize it to // make it non-negative. - if (D->normalize(SE)) + bool Normalized = D->normalize(SE); + if (Normalized) LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n"); LLVM_DEBUG(StringRef DepType = D->isFlow() ? "flow" : D->isAnti() ? 
"anti" : "output"; @@ -214,8 +233,17 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, } // Make sure we only add unique entries to the dependency matrix. - if (Seen.insert(StringRef(Dep.data(), Dep.size())).second) + unsigned Index = DepMatrix.size(); + auto [Ite, Inserted] = + Seen.try_emplace(StringRef(Dep.data(), Dep.size()), Index); + if (Inserted) { DepMatrix.push_back(Dep); + NegStatusVec.push_back(NegatedStatus{}); + } else + Index = Ite->second; + + NegatedStatus &Status = NegStatusVec[Index]; + (Normalized ? Status.Negated : Status.Original) = true; } } } @@ -400,6 +428,7 @@ class LoopInterchangeProfitability { bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix, + const std::vector &NegStatusVec, const DenseMap &CostMap, std::unique_ptr &CC); @@ -409,9 +438,10 @@ class LoopInterchangeProfitability { const DenseMap &CostMap, std::unique_ptr &CC); std::optional isProfitablePerInstrOrderCost(); - std::optional isProfitableForVectorization(unsigned InnerLoopId, - unsigned OuterLoopId, - CharMatrix &DepMatrix); + std::optional + isProfitableForVectorization(unsigned InnerLoopId, unsigned OuterLoopId, + CharMatrix &DepMatrix, + const std::vector &NegStatusVec); Loop *OuterLoop; Loop *InnerLoop; @@ -503,8 +533,9 @@ struct LoopInterchange { << "\n"); CharMatrix DependencyMatrix; + std::vector NegStatusVec; Loop *OuterMostLoop = *(LoopList.begin()); - if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth, + if (!populateDependencyMatrix(DependencyMatrix, NegStatusVec, LoopNestDepth, OuterMostLoop, DI, SE, ORE)) { LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n"); return false; @@ -543,8 +574,8 @@ struct LoopInterchange { for (unsigned j = SelecLoopId; j > 0; j--) { bool ChangedPerIter = false; for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) { - bool Interchanged = - processLoop(LoopList, i, i - 1, DependencyMatrix, CostMap); + 
bool Interchanged = processLoop(LoopList, i, i - 1, DependencyMatrix, + NegStatusVec, CostMap); ChangedPerIter |= Interchanged; Changed |= Interchanged; } @@ -559,6 +590,8 @@ struct LoopInterchange { bool processLoop(SmallVectorImpl &LoopList, unsigned InnerLoopId, unsigned OuterLoopId, std::vector> &DependencyMatrix, + + const std::vector &NegStatusVec, const DenseMap &CostMap) { Loop *OuterLoop = LoopList[OuterLoopId]; Loop *InnerLoop = LoopList[InnerLoopId]; @@ -572,7 +605,7 @@ struct LoopInterchange { LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n"); LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE); if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId, - DependencyMatrix, CostMap, CC)) { + DependencyMatrix, NegStatusVec, CostMap, CC)) { LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n"); return false; } @@ -1197,27 +1230,71 @@ LoopInterchangeProfitability::isProfitablePerInstrOrderCost() { return std::nullopt; } +static char flipDirection(char Dir) { + switch (Dir) { + case '<': + return '>'; + case '>': + return '<'; + case '=': + case 'I': + case '*': + return Dir; + default: + llvm_unreachable("Unknown direction"); + } +} + +/// Ensure that there are no negative direction dependencies corresponding to \p +/// Dir. +bool NegatedStatus::isNonNegativeDir(char Dir) const { + assert((Original || Negated) && "Cannot restore the original direction"); + + // If both flag is true, it means that there is both as-is and negated + // direction. In this case only `=` or `I` don't have negative direction + // dependency. + if (Original && Negated) + return Dir == '=' || Dir == 'I'; + + char Restored = Negated ? flipDirection(Dir) : Dir; + return Restored == '=' || Restored == 'I' || Restored == '<'; +} + /// Return true if we can vectorize the loop specified by \p LoopId. 
-static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) { +static bool canVectorize(const CharMatrix &DepMatrix, + const std::vector &NegStatusVec, + unsigned LoopId) { + // The loop can be vectorized if there are no negative dependencies. Consider + // the dependency of `j` in the following example. + // + // Positive: ... = A[i][j] Negative: ... = A[i][j-1] + // A[i][j-1] = ... A[i][j] = ... + // + // In the right case, vectorizing the loop can change the loaded value from + // `A[i][j-1]`. At the moment we don't take into account the distance of the + // dependency and vector width. + // TODO: Considering the dependency distance and the vector width can give a + // more accurate result. For example, the following loop can be vectorized if + // the vector width is less than or equal to 4 x sizeof(A[0][0]). for (unsigned I = 0; I != DepMatrix.size(); I++) { char Dir = DepMatrix[I][LoopId]; - if (Dir != 'I' && Dir != '=') + if (!NegStatusVec[I].isNonNegativeDir(Dir)) return false; } return true; } std::optional LoopInterchangeProfitability::isProfitableForVectorization( - unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) { - // If the outer loop is not loop independent it is not profitable to move - // this to inner position, since doing so would not enable inner loop - // parallelism. - if (!canVectorize(DepMatrix, OuterLoopId)) + unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix, + const std::vector &NegStatusVec) { + // If the outer loop cannot be vectorized, it is not profitable to move this + // to inner position. + if (!canVectorize(DepMatrix, NegStatusVec, OuterLoopId)) return false; - // If inner loop has dependence and outer loop is loop independent then it is + // If inner loop cannot be vectorized and outer loop can be then it is // profitable to interchange to enable inner loop parallelism. 
- if (!canVectorize(DepMatrix, InnerLoopId)) + if (!canVectorize(DepMatrix, NegStatusVec, InnerLoopId)) return true; // If both the inner and the outer loop can be vectorized, it is necessary to @@ -1231,6 +1308,7 @@ std::optional LoopInterchangeProfitability::isProfitableForVectorization( bool LoopInterchangeProfitability::isProfitable( const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix, + const std::vector &NegStatusVec, const DenseMap &CostMap, std::unique_ptr &CC) { // isProfitable() is structured to avoid endless loop interchange. If the @@ -1252,8 +1330,8 @@ bool LoopInterchangeProfitability::isProfitable( shouldInterchange = isProfitablePerInstrOrderCost(); break; case RuleTy::ForVectorization: - shouldInterchange = - isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix); + shouldInterchange = isProfitableForVectorization(InnerLoopId, OuterLoopId, + DepMatrix, NegStatusVec); break; } diff --git a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll index 0f5aee582373d..14c2046eebbb4 100644 --- a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll +++ b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll @@ -64,15 +64,13 @@ exit: ; for (int j = 1; j < 256; j++) ; A[i][j-1] = A[i][j] + B[i][j]; ; -; FIXME: These loops are exchanged at this time due to the problem in -; profitability heuristic calculation for vectorization. -; CHECK: --- !Passed +; CHECK: --- !Missed ; CHECK-NEXT: Pass: loop-interchange -; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Name: InterchangeNotProfitable ; CHECK-NEXT: Function: interchange_unnecesasry_for_vectorization ; CHECK-NEXT: Args: -; CHECK-NEXT: - String: Loop interchanged with enclosing loop. +; CHECK-NEXT: - String: Insufficient information to calculate the cost of loop for interchange. 
define void @interchange_unnecesasry_for_vectorization() { entry: br label %for.i.header From 8f4f814b01d2ad5cab1962513adc8bf7deeec012 Mon Sep 17 00:00:00 2001 From: Ryotaro Kasuga Date: Thu, 3 Apr 2025 09:55:13 +0000 Subject: [PATCH 2/3] Handle negated and non negated direction vectors separately. --- .../lib/Transforms/Scalar/LoopInterchange.cpp | 90 ++++++------------- 1 file changed, 27 insertions(+), 63 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 078da53c52b52..fe33ee33258f1 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSet.h" #include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/LoopCacheAnalysis.h" #include "llvm/Analysis/LoopInfo.h" @@ -80,21 +81,6 @@ enum class RuleTy { ForVectorization, }; -/// Store the information about if corresponding direction vector was negated -/// by normalization or not. This is necessary to restore the original one from -/// a row of a dependency matrix, because we only manage normalized direction -/// vectors and duplicate vectors are eliminated. So there may be both original -/// and negated vectors for a single entry (a row of dependency matrix). E.g., -/// if there are two direction vectors `[< =]` and `[> =]`, the later one will -/// be converted to the same as former one by normalization, so only `[< =]` -/// would be retained in the final result. -struct NegatedStatus { - bool Original = false; - bool Negated = false; - - bool isNonNegativeDir(char Dir) const; -}; - } // end anonymous namespace // Minimum loop depth supported. 
@@ -142,9 +128,9 @@ static void printDepMatrix(CharMatrix &DepMatrix) { #endif static bool populateDependencyMatrix(CharMatrix &DepMatrix, - std::vector &NegStatusVec, - unsigned Level, Loop *L, - DependenceInfo *DI, ScalarEvolution *SE, + BitVector &IsNegatedVec, unsigned Level, + Loop *L, DependenceInfo *DI, + ScalarEvolution *SE, OptimizationRemarkEmitter *ORE) { using ValueVector = SmallVector; @@ -184,8 +170,8 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, } ValueVector::iterator I, IE, J, JE; - // Manage all found direction vectors. and map it to the index of DepMatrix. - StringMap Seen; + // Manage all found direction vectors, negated and not negated, separately. + StringSet<> Seen[2]; for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) { for (J = I, JE = MemInstr.end(); J != JE; ++J) { @@ -233,17 +219,12 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, } // Make sure we only add unique entries to the dependency matrix. - unsigned Index = DepMatrix.size(); - auto [Ite, Inserted] = - Seen.try_emplace(StringRef(Dep.data(), Dep.size()), Index); - if (Inserted) { + // Negated vectors (due to normalization) are treated as separate from + // non negated ones. + if (Seen[Normalized].insert(StringRef(Dep.data(), Dep.size())).second) { DepMatrix.push_back(Dep); - NegStatusVec.push_back(NegatedStatus{}); - } else - Index = Ite->second; - - NegatedStatus &Status = NegStatusVec[Index]; - (Normalized ? Status.Negated : Status.Original) = true; + IsNegatedVec.push_back(Normalized); + } } } } @@ -427,8 +408,7 @@ class LoopInterchangeProfitability { /// Check if the loop interchange is profitable. 
bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, unsigned OuterLoopId, - CharMatrix &DepMatrix, - const std::vector &NegStatusVec, + CharMatrix &DepMatrix, const BitVector &IsNegatedVec, const DenseMap &CostMap, std::unique_ptr &CC); @@ -441,7 +421,7 @@ class LoopInterchangeProfitability { std::optional isProfitableForVectorization(unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix, - const std::vector &NegStatusVec); + const BitVector &IsNegatedVec); Loop *OuterLoop; Loop *InnerLoop; @@ -533,9 +513,9 @@ struct LoopInterchange { << "\n"); CharMatrix DependencyMatrix; - std::vector NegStatusVec; + BitVector IsNegatedVec; Loop *OuterMostLoop = *(LoopList.begin()); - if (!populateDependencyMatrix(DependencyMatrix, NegStatusVec, LoopNestDepth, + if (!populateDependencyMatrix(DependencyMatrix, IsNegatedVec, LoopNestDepth, OuterMostLoop, DI, SE, ORE)) { LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n"); return false; @@ -575,7 +555,7 @@ struct LoopInterchange { bool ChangedPerIter = false; for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) { bool Interchanged = processLoop(LoopList, i, i - 1, DependencyMatrix, - NegStatusVec, CostMap); + IsNegatedVec, CostMap); ChangedPerIter |= Interchanged; Changed |= Interchanged; } @@ -590,8 +570,7 @@ struct LoopInterchange { bool processLoop(SmallVectorImpl &LoopList, unsigned InnerLoopId, unsigned OuterLoopId, std::vector> &DependencyMatrix, - - const std::vector &NegStatusVec, + BitVector &IsNegatedVec, const DenseMap &CostMap) { Loop *OuterLoop = LoopList[OuterLoopId]; Loop *InnerLoop = LoopList[InnerLoopId]; @@ -605,7 +584,7 @@ struct LoopInterchange { LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n"); LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE); if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId, - DependencyMatrix, NegStatusVec, CostMap, CC)) { + DependencyMatrix, IsNegatedVec, CostMap, CC)) { LLVM_DEBUG(dbgs() << 
"Interchanging loops not profitable.\n"); return false; } @@ -1245,25 +1224,9 @@ static char flipDirection(char Dir) { } } -/// Ensure that there are no negative direction dependencies corresponding to \p -/// Dir. -bool NegatedStatus::isNonNegativeDir(char Dir) const { - assert((Original || Negated) && "Cannot restore the original direction"); - - // If both flag is true, it means that there is both as-is and negated - // direction. In this case only `=` or `I` don't have negative direction - // dependency. - if (Original && Negated) - return Dir == '=' || Dir == 'I'; - - char Restored = Negated ? flipDirection(Dir) : Dir; - return Restored == '=' || Restored == 'I' || Restored == '<'; -} - /// Return true if we can vectorize the loop specified by \p LoopId. static bool canVectorize(const CharMatrix &DepMatrix, - const std::vector &NegStatusVec, - unsigned LoopId) { + const BitVector &IsNegatedVec, unsigned LoopId) { // The loop can be vectorized if there are no negative dependencies. Consider // the dependency of `j` in the following example. // @@ -1278,7 +1241,9 @@ static bool canVectorize(const CharMatrix &DepMatrix, // the vector width is less than or equal to 4 x sizeof(A[0][0]). for (unsigned I = 0; I != DepMatrix.size(); I++) { char Dir = DepMatrix[I][LoopId]; - if (!NegStatusVec[I].isNonNegativeDir(Dir)) + if (IsNegatedVec[I]) + Dir = flipDirection(Dir); + if (Dir != '=' && Dir != 'I' && Dir != '<') return false; } return true; @@ -1286,15 +1251,15 @@ static bool canVectorize(const CharMatrix &DepMatrix, std::optional LoopInterchangeProfitability::isProfitableForVectorization( unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix, - const std::vector &NegStatusVec) { + const BitVector &IsNegatedVec) { // If the outer loop cannot be vectorized, it is not profitable to move this // to inner position. 
- if (!canVectorize(DepMatrix, NegStatusVec, OuterLoopId)) + if (!canVectorize(DepMatrix, IsNegatedVec, OuterLoopId)) return false; // If inner loop cannot be vectorized and outer loop can be then it is // profitable to interchange to enable inner loop parallelism. - if (!canVectorize(DepMatrix, NegStatusVec, InnerLoopId)) + if (!canVectorize(DepMatrix, IsNegatedVec, InnerLoopId)) return true; // If both the inner and the outer loop can be vectorized, it is necessary to @@ -1307,8 +1272,7 @@ std::optional LoopInterchangeProfitability::isProfitableForVectorization( bool LoopInterchangeProfitability::isProfitable( const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, - unsigned OuterLoopId, CharMatrix &DepMatrix, - const std::vector &NegStatusVec, + unsigned OuterLoopId, CharMatrix &DepMatrix, const BitVector &IsNegatedVec, const DenseMap &CostMap, std::unique_ptr &CC) { // isProfitable() is structured to avoid endless loop interchange. If the @@ -1331,7 +1295,7 @@ bool LoopInterchangeProfitability::isProfitable( break; case RuleTy::ForVectorization: shouldInterchange = isProfitableForVectorization(InnerLoopId, OuterLoopId, - DepMatrix, NegStatusVec); + DepMatrix, IsNegatedVec); break; } From cad4db91a1c86941a4eabf17a9accc0df3ec65f2 Mon Sep 17 00:00:00 2001 From: Ryotaro Kasuga Date: Tue, 8 Apr 2025 14:58:57 +0000 Subject: [PATCH 3/3] Add test that has positive dependencies --- .../profitability-vectorization-heuristic.ll | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll index 14c2046eebbb4..7108d3adf5d79 100644 --- a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll +++ b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll @@ -101,3 +101,59 @@ for.i.inc: exit: ret void } + +; Check that the below loops are exchanged to 
allow innermost loop +; vectorization. We cannot vectorize the j-loop because it has a negative +; distance dependency, but the i-loop can be vectorized. +; +; for (int i = 1; i < 255; i++) { +; for (int j = 1; j < 256; j++) { +; A[i][j] = A[i][j-1] + B[i][j]; +; C[i][j] += C[i+1][j]; +; } +; } +; + +; CHECK: --- !Passed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: interchange_necessary_for_vectorization2 +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: Loop interchanged with enclosing loop. +define void @interchange_necessary_for_vectorization2() { +entry: + br label %for.i.header + +for.i.header: + %i = phi i64 [ 1, %entry ], [ %i.next, %for.i.inc ] + %i.inc = add nsw i64 %i, 1 + br label %for.j.body + +for.j.body: + %j = phi i64 [ 1, %for.i.header ], [ %j.next, %for.j.body ] + %j.dec = add nsw i64 %j, -1 + %a.load.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %i, i64 %j.dec + %b.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @B, i64 %i, i64 %j + %c.load.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @C, i64 %i.inc, i64 %j + %c.store.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @C, i64 %i, i64 %j + %a = load float, ptr %a.load.index, align 4 + %b = load float, ptr %b.index, align 4 + %c0 = load float, ptr %c.load.index, align 4 + %c1 = load float, ptr %c.store.index, align 4 + %add.0 = fadd float %a, %b + %a.store.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %i, i64 %j + store float %add.0, ptr %a.store.index, align 4 + %add.1 = fadd float %c0, %c1 + store float %add.1, ptr %c.store.index, align 4 + %j.next = add nuw nsw i64 %j, 1 + %cmp.j = icmp eq i64 %j.next, 256 + br i1 %cmp.j, label %for.i.inc, label %for.j.body + +for.i.inc: + %i.next = add nuw nsw i64 %i, 1 + %cmp.i = icmp eq i64 %i.next, 255 + br i1 %cmp.i, label %exit, label %for.i.header + +exit: + ret void +}