Skip to content

[LoopInterchange] Add tests for the vectorization profitability (NFC) #133665

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 2, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 \
; RUN: -pass-remarks-output=%t -disable-output -loop-interchange-profitabilities=vectorize
; RUN: FileCheck -input-file %t %s

@A = dso_local global [256 x [256 x float]] zeroinitializer
@B = dso_local global [256 x [256 x float]] zeroinitializer
@C = dso_local global [256 x [256 x float]] zeroinitializer

; Check that the below loops are exchanged for vectorization.
;
; for (int i = 0; i < 256; i++) {
; for (int j = 1; j < 256; j++) {
; A[i][j] = A[i][j-1] + B[i][j];
; C[i][j] += 1;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just two question about this out of curiousity:

  • is the write to array C necessary?
  • and is vectorised with a gather/scatter?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is the write to array C necessary?

Yes. Due to the existence of the write to C, the profitability check makes a wrong decision as "we can vectorize the j-loop". Please see the description of #133667 for more details if you are interested in.

and is vectorised with a gather/scatter?

Maybe yes, if the i-loop becomes the innermost one by interchange and it is vectorized. Note that this depends on LoopVectorizer whether the loop is actually vectorized or not.

The current vectorization profitability check in LoopInterchange only checks if a loop can be vectorized or we cannot due to some dependency. To make better decisions, I think we should estimate the vectorization cost.

; }
; }
;
; FIXME: These loops are not exchanged at this time due to the problem in
; profitability heuristic calculation for vectorization.

; CHECK: --- !Missed
; CHECK-NEXT: Pass: loop-interchange
; CHECK-NEXT: Name: InterchangeNotProfitable
; CHECK-NEXT: Function: interchange_necessary_for_vectorization
; CHECK-NEXT: Args:
; CHECK-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization.
; CHECK-NEXT: ...
define void @interchange_necessary_for_vectorization() {
entry:
br label %for.i.header

for.i.header:
%i = phi i64 [ 1, %entry ], [ %i.next, %for.i.inc ]
br label %for.j.body

for.j.body:
%j = phi i64 [ 1, %for.i.header ], [ %j.next, %for.j.body ]
%j.dec = add nsw i64 %j, -1
%a.load.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %i, i64 %j.dec
%b.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @B, i64 %i, i64 %j
%c.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @C, i64 %i, i64 %j
%a = load float, ptr %a.load.index, align 4
%b = load float, ptr %b.index, align 4
%c = load float, ptr %c.index, align 4
%add.0 = fadd float %a, %b
%a.store.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %i, i64 %j
store float %add.0, ptr %a.store.index, align 4
%add.1 = fadd float %c, 1.0
store float %add.1, ptr %c.index, align 4
%j.next = add nuw nsw i64 %j, 1
%cmp.j = icmp eq i64 %j.next, 256
br i1 %cmp.j, label %for.i.inc, label %for.j.body

for.i.inc:
%i.next = add nuw nsw i64 %i, 1
%cmp.i = icmp eq i64 %i.next, 256
br i1 %cmp.i, label %exit, label %for.i.header

exit:
ret void
}

; Check that the following innermost loop can be vectorized so that
; interchanging is unnecessary.
;
; for (int i = 0; i < 256; i++)
; for (int j = 1; j < 256; j++)
; A[i][j-1] = A[i][j] + B[i][j];
;
; FIXME: These loops are exchanged at this time due to the problem in
; profitability heuristic calculation for vectorization.

; CHECK: --- !Passed
; CHECK-NEXT: Pass: loop-interchange
; CHECK-NEXT: Name: Interchanged
; CHECK-NEXT: Function: interchange_unnecesasry_for_vectorization
; CHECK-NEXT: Args:
; CHECK-NEXT: - String: Loop interchanged with enclosing loop.
define void @interchange_unnecesasry_for_vectorization() {
entry:
br label %for.i.header

for.i.header:
%i = phi i64 [ 1, %entry ], [ %i.next, %for.i.inc ]
br label %for.j.body

for.j.body:
%j = phi i64 [ 1, %for.i.header ], [ %j.next, %for.j.body ]
%j.dec = add nsw i64 %j, -1
%a.load.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %i, i64 %j
%b.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @B, i64 %i, i64 %j
%a = load float, ptr %a.load.index, align 4
%b = load float, ptr %b.index, align 4
%add = fadd float %a, %b
%a.store.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %i, i64 %j.dec
store float %add, ptr %a.store.index, align 4
%j.next = add nuw nsw i64 %j, 1
%cmp.j = icmp eq i64 %j.next, 256
br i1 %cmp.j, label %for.i.inc, label %for.j.body

for.i.inc:
%i.next = add nuw nsw i64 %i, 1
%cmp.i = icmp eq i64 %i.next, 256
br i1 %cmp.i, label %exit, label %for.i.header

exit:
ret void
}