[NVPTX] Restrict combining to properly aligned v16i8 vectors. #107919
Fixes generation of invalid loads that led to misaligned access errors. Also updates the tests to use the automatic check generator.
@llvm/pr-subscribers-backend-nvptx
Author: Artem Belevich (Artem-B)
Changes: Fixes generation of invalid loads that led to misaligned access errors. Also updated the tests to use the automatic check generator.
Patch is 21.16 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/107919.diff
2 Files Affected:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index bb76ffdfd99d7b..5c5766a8b23455 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -6038,7 +6038,11 @@ static SDValue PerformLOADCombine(SDNode *N,
// elements can be optimised away instead of being needlessly split during
// legalization, which involves storing to the stack and loading it back.
EVT VT = N->getValueType(0);
- if (VT != MVT::v16i8)
+ bool CorrectlyAligned =
+ DCI.DAG.getTargetLoweringInfo().allowsMemoryAccessForAlignment(
+ *DAG.getContext(), DAG.getDataLayout(), LD->getMemoryVT(),
+ *LD->getMemOperand());
+ if (!(VT == MVT::v16i8 && CorrectlyAligned))
return SDValue();
SDLoc DL(N);
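
The hunk above gates the combine on the target's answer to allowsMemoryAccessForAlignment: the v16i8 load is only rewritten into a wide vector load when the memory operand is known to be sufficiently aligned; otherwise the combine bails out and the load goes through normal legalization. As a rough, standalone illustration of that decision (not the LLVM API; the helper names and the 16-byte constant here are assumptions for the sketch):

```cpp
#include <cstdint>
#include <iostream>

// Hypothetical stand-in for the alignment query: the access is acceptable
// only if the pointer's known alignment covers the type's natural alignment.
bool accessIsSufficientlyAligned(uint64_t knownAlign, uint64_t typeAlign) {
  return knownAlign >= typeAlign;
}

// Mirrors the gating idea in the patch: combine a v16i8 load into a wide
// vector load only when it is adequately aligned; otherwise leave it alone
// (the equivalent of returning SDValue() from the combine).
bool shouldCombineV16i8Load(uint64_t knownAlign) {
  constexpr uint64_t V16I8NaturalAlign = 16; // assumed natural alignment of <16 x i8>
  return accessIsSufficientlyAligned(knownAlign, V16I8NaturalAlign);
}

int main() {
  std::cout << shouldCombineV16i8Load(16) << "\n"; // 1: combine may fire
  std::cout << shouldCombineV16i8Load(8) << "\n";  // 0: bail out, avoid a misaligned wide load
}
```

The second case corresponds to the new combine_v16i8_unaligned test below, where the pointer is only align 8 and the backend must not emit a 16-byte vector load.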
diff --git a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
index 868a06e2a850cc..bc58a700cb9828 100644
--- a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
+++ b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s | FileCheck -check-prefix=ENABLED %s
; RUN: llc -disable-nvptx-load-store-vectorizer < %s | FileCheck -check-prefix=DISABLED %s
; RUN: %if ptxas %{ llc < %s | %ptxas-verify %}
@@ -8,10 +9,31 @@ target triple = "nvptx64-nvidia-cuda"
; Check that the load-store vectorizer is enabled by default for nvptx, and
; that it's disabled by the appropriate flag.
-; ENABLED: ld.v2.{{.}}32
-; DISABLED: ld.{{.}}32
-; DISABLED: ld.{{.}}32
define i32 @f(ptr %p) {
+; ENABLED-LABEL: f(
+; ENABLED: {
+; ENABLED-NEXT: .reg .b32 %r<4>;
+; ENABLED-NEXT: .reg .b64 %rd<2>;
+; ENABLED-EMPTY:
+; ENABLED-NEXT: // %bb.0:
+; ENABLED-NEXT: ld.param.u64 %rd1, [f_param_0];
+; ENABLED-NEXT: ld.v2.u32 {%r1, %r2}, [%rd1];
+; ENABLED-NEXT: add.s32 %r3, %r1, %r2;
+; ENABLED-NEXT: st.param.b32 [func_retval0+0], %r3;
+; ENABLED-NEXT: ret;
+;
+; DISABLED-LABEL: f(
+; DISABLED: {
+; DISABLED-NEXT: .reg .b32 %r<4>;
+; DISABLED-NEXT: .reg .b64 %rd<2>;
+; DISABLED-EMPTY:
+; DISABLED-NEXT: // %bb.0:
+; DISABLED-NEXT: ld.param.u64 %rd1, [f_param_0];
+; DISABLED-NEXT: ld.u32 %r1, [%rd1];
+; DISABLED-NEXT: ld.u32 %r2, [%rd1+4];
+; DISABLED-NEXT: add.s32 %r3, %r1, %r2;
+; DISABLED-NEXT: st.param.b32 [func_retval0+0], %r3;
+; DISABLED-NEXT: ret;
%p.1 = getelementptr i32, ptr %p, i32 1
%v0 = load i32, ptr %p, align 8
%v1 = load i32, ptr %p.1, align 4
@@ -20,6 +42,66 @@ define i32 @f(ptr %p) {
}
define half @fh(ptr %p) {
+; ENABLED-LABEL: fh(
+; ENABLED: {
+; ENABLED-NEXT: .reg .b16 %rs<10>;
+; ENABLED-NEXT: .reg .f32 %f<13>;
+; ENABLED-NEXT: .reg .b64 %rd<2>;
+; ENABLED-EMPTY:
+; ENABLED-NEXT: // %bb.0:
+; ENABLED-NEXT: ld.param.u64 %rd1, [fh_param_0];
+; ENABLED-NEXT: ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; ENABLED-NEXT: ld.b16 %rs5, [%rd1+8];
+; ENABLED-NEXT: cvt.f32.f16 %f1, %rs2;
+; ENABLED-NEXT: cvt.f32.f16 %f2, %rs1;
+; ENABLED-NEXT: add.rn.f32 %f3, %f2, %f1;
+; ENABLED-NEXT: cvt.rn.f16.f32 %rs6, %f3;
+; ENABLED-NEXT: cvt.f32.f16 %f4, %rs4;
+; ENABLED-NEXT: cvt.f32.f16 %f5, %rs3;
+; ENABLED-NEXT: add.rn.f32 %f6, %f5, %f4;
+; ENABLED-NEXT: cvt.rn.f16.f32 %rs7, %f6;
+; ENABLED-NEXT: cvt.f32.f16 %f7, %rs7;
+; ENABLED-NEXT: cvt.f32.f16 %f8, %rs6;
+; ENABLED-NEXT: add.rn.f32 %f9, %f8, %f7;
+; ENABLED-NEXT: cvt.rn.f16.f32 %rs8, %f9;
+; ENABLED-NEXT: cvt.f32.f16 %f10, %rs8;
+; ENABLED-NEXT: cvt.f32.f16 %f11, %rs5;
+; ENABLED-NEXT: add.rn.f32 %f12, %f10, %f11;
+; ENABLED-NEXT: cvt.rn.f16.f32 %rs9, %f12;
+; ENABLED-NEXT: st.param.b16 [func_retval0+0], %rs9;
+; ENABLED-NEXT: ret;
+;
+; DISABLED-LABEL: fh(
+; DISABLED: {
+; DISABLED-NEXT: .reg .b16 %rs<10>;
+; DISABLED-NEXT: .reg .f32 %f<13>;
+; DISABLED-NEXT: .reg .b64 %rd<2>;
+; DISABLED-EMPTY:
+; DISABLED-NEXT: // %bb.0:
+; DISABLED-NEXT: ld.param.u64 %rd1, [fh_param_0];
+; DISABLED-NEXT: ld.b16 %rs1, [%rd1];
+; DISABLED-NEXT: ld.b16 %rs2, [%rd1+2];
+; DISABLED-NEXT: ld.b16 %rs3, [%rd1+4];
+; DISABLED-NEXT: ld.b16 %rs4, [%rd1+6];
+; DISABLED-NEXT: ld.b16 %rs5, [%rd1+8];
+; DISABLED-NEXT: cvt.f32.f16 %f1, %rs2;
+; DISABLED-NEXT: cvt.f32.f16 %f2, %rs1;
+; DISABLED-NEXT: add.rn.f32 %f3, %f2, %f1;
+; DISABLED-NEXT: cvt.rn.f16.f32 %rs6, %f3;
+; DISABLED-NEXT: cvt.f32.f16 %f4, %rs4;
+; DISABLED-NEXT: cvt.f32.f16 %f5, %rs3;
+; DISABLED-NEXT: add.rn.f32 %f6, %f5, %f4;
+; DISABLED-NEXT: cvt.rn.f16.f32 %rs7, %f6;
+; DISABLED-NEXT: cvt.f32.f16 %f7, %rs7;
+; DISABLED-NEXT: cvt.f32.f16 %f8, %rs6;
+; DISABLED-NEXT: add.rn.f32 %f9, %f8, %f7;
+; DISABLED-NEXT: cvt.rn.f16.f32 %rs8, %f9;
+; DISABLED-NEXT: cvt.f32.f16 %f10, %rs8;
+; DISABLED-NEXT: cvt.f32.f16 %f11, %rs5;
+; DISABLED-NEXT: add.rn.f32 %f12, %f10, %f11;
+; DISABLED-NEXT: cvt.rn.f16.f32 %rs9, %f12;
+; DISABLED-NEXT: st.param.b16 [func_retval0+0], %rs9;
+; DISABLED-NEXT: ret;
%p.1 = getelementptr half, ptr %p, i32 1
%p.2 = getelementptr half, ptr %p, i32 2
%p.3 = getelementptr half, ptr %p, i32 3
@@ -37,6 +119,40 @@ define half @fh(ptr %p) {
}
define float @ff(ptr %p) {
+; ENABLED-LABEL: ff(
+; ENABLED: {
+; ENABLED-NEXT: .reg .f32 %f<10>;
+; ENABLED-NEXT: .reg .b64 %rd<2>;
+; ENABLED-EMPTY:
+; ENABLED-NEXT: // %bb.0:
+; ENABLED-NEXT: ld.param.u64 %rd1, [ff_param_0];
+; ENABLED-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; ENABLED-NEXT: ld.f32 %f5, [%rd1+16];
+; ENABLED-NEXT: add.rn.f32 %f6, %f1, %f2;
+; ENABLED-NEXT: add.rn.f32 %f7, %f3, %f4;
+; ENABLED-NEXT: add.rn.f32 %f8, %f6, %f7;
+; ENABLED-NEXT: add.rn.f32 %f9, %f8, %f5;
+; ENABLED-NEXT: st.param.f32 [func_retval0+0], %f9;
+; ENABLED-NEXT: ret;
+;
+; DISABLED-LABEL: ff(
+; DISABLED: {
+; DISABLED-NEXT: .reg .f32 %f<10>;
+; DISABLED-NEXT: .reg .b64 %rd<2>;
+; DISABLED-EMPTY:
+; DISABLED-NEXT: // %bb.0:
+; DISABLED-NEXT: ld.param.u64 %rd1, [ff_param_0];
+; DISABLED-NEXT: ld.f32 %f1, [%rd1];
+; DISABLED-NEXT: ld.f32 %f2, [%rd1+4];
+; DISABLED-NEXT: ld.f32 %f3, [%rd1+8];
+; DISABLED-NEXT: ld.f32 %f4, [%rd1+12];
+; DISABLED-NEXT: ld.f32 %f5, [%rd1+16];
+; DISABLED-NEXT: add.rn.f32 %f6, %f1, %f2;
+; DISABLED-NEXT: add.rn.f32 %f7, %f3, %f4;
+; DISABLED-NEXT: add.rn.f32 %f8, %f6, %f7;
+; DISABLED-NEXT: add.rn.f32 %f9, %f8, %f5;
+; DISABLED-NEXT: st.param.f32 [func_retval0+0], %f9;
+; DISABLED-NEXT: ret;
%p.1 = getelementptr float, ptr %p, i32 1
%p.2 = getelementptr float, ptr %p, i32 2
%p.3 = getelementptr float, ptr %p, i32 3
@@ -54,8 +170,90 @@ define float @ff(ptr %p) {
}
define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr2) {
- ; ENABLED-LABEL: combine_v16i8
- ; ENABLED: ld.v4.u32
+; ENABLED-LABEL: combine_v16i8(
+; ENABLED: {
+; ENABLED-NEXT: .reg .b32 %r<40>;
+; ENABLED-NEXT: .reg .b64 %rd<3>;
+; ENABLED-EMPTY:
+; ENABLED-NEXT: // %bb.0:
+; ENABLED-NEXT: ld.param.u64 %rd1, [combine_v16i8_param_0];
+; ENABLED-NEXT: ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; ENABLED-NEXT: ld.param.u64 %rd2, [combine_v16i8_param_1];
+; ENABLED-NEXT: bfe.u32 %r9, %r1, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r10, %r1, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r11, %r1, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r12, %r1, 24, 8;
+; ENABLED-NEXT: bfe.u32 %r13, %r2, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r14, %r2, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r15, %r2, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r16, %r2, 24, 8;
+; ENABLED-NEXT: bfe.u32 %r17, %r3, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r18, %r3, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r19, %r3, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r20, %r3, 24, 8;
+; ENABLED-NEXT: bfe.u32 %r21, %r4, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r22, %r4, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r23, %r4, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r24, %r4, 24, 8;
+; ENABLED-NEXT: add.s32 %r25, %r9, %r10;
+; ENABLED-NEXT: add.s32 %r26, %r25, %r11;
+; ENABLED-NEXT: add.s32 %r27, %r26, %r12;
+; ENABLED-NEXT: add.s32 %r28, %r27, %r13;
+; ENABLED-NEXT: add.s32 %r29, %r28, %r14;
+; ENABLED-NEXT: add.s32 %r30, %r29, %r15;
+; ENABLED-NEXT: add.s32 %r31, %r30, %r16;
+; ENABLED-NEXT: add.s32 %r32, %r31, %r17;
+; ENABLED-NEXT: add.s32 %r33, %r32, %r18;
+; ENABLED-NEXT: add.s32 %r34, %r33, %r19;
+; ENABLED-NEXT: add.s32 %r35, %r34, %r20;
+; ENABLED-NEXT: add.s32 %r36, %r35, %r21;
+; ENABLED-NEXT: add.s32 %r37, %r36, %r22;
+; ENABLED-NEXT: add.s32 %r38, %r37, %r23;
+; ENABLED-NEXT: add.s32 %r39, %r38, %r24;
+; ENABLED-NEXT: st.u32 [%rd2], %r39;
+; ENABLED-NEXT: ret;
+;
+; DISABLED-LABEL: combine_v16i8(
+; DISABLED: {
+; DISABLED-NEXT: .reg .b32 %r<32>;
+; DISABLED-NEXT: .reg .b64 %rd<3>;
+; DISABLED-EMPTY:
+; DISABLED-NEXT: // %bb.0:
+; DISABLED-NEXT: ld.param.u64 %rd1, [combine_v16i8_param_0];
+; DISABLED-NEXT: ld.u8 %r1, [%rd1];
+; DISABLED-NEXT: ld.param.u64 %rd2, [combine_v16i8_param_1];
+; DISABLED-NEXT: ld.u8 %r2, [%rd1+1];
+; DISABLED-NEXT: ld.u8 %r3, [%rd1+2];
+; DISABLED-NEXT: ld.u8 %r4, [%rd1+3];
+; DISABLED-NEXT: ld.u8 %r5, [%rd1+4];
+; DISABLED-NEXT: ld.u8 %r6, [%rd1+5];
+; DISABLED-NEXT: ld.u8 %r7, [%rd1+6];
+; DISABLED-NEXT: ld.u8 %r8, [%rd1+7];
+; DISABLED-NEXT: ld.u8 %r9, [%rd1+8];
+; DISABLED-NEXT: ld.u8 %r10, [%rd1+9];
+; DISABLED-NEXT: ld.u8 %r11, [%rd1+10];
+; DISABLED-NEXT: ld.u8 %r12, [%rd1+11];
+; DISABLED-NEXT: ld.u8 %r13, [%rd1+12];
+; DISABLED-NEXT: ld.u8 %r14, [%rd1+13];
+; DISABLED-NEXT: ld.u8 %r15, [%rd1+14];
+; DISABLED-NEXT: ld.u8 %r16, [%rd1+15];
+; DISABLED-NEXT: add.s32 %r17, %r1, %r2;
+; DISABLED-NEXT: add.s32 %r18, %r17, %r3;
+; DISABLED-NEXT: add.s32 %r19, %r18, %r4;
+; DISABLED-NEXT: add.s32 %r20, %r19, %r5;
+; DISABLED-NEXT: add.s32 %r21, %r20, %r6;
+; DISABLED-NEXT: add.s32 %r22, %r21, %r7;
+; DISABLED-NEXT: add.s32 %r23, %r22, %r8;
+; DISABLED-NEXT: add.s32 %r24, %r23, %r9;
+; DISABLED-NEXT: add.s32 %r25, %r24, %r10;
+; DISABLED-NEXT: add.s32 %r26, %r25, %r11;
+; DISABLED-NEXT: add.s32 %r27, %r26, %r12;
+; DISABLED-NEXT: add.s32 %r28, %r27, %r13;
+; DISABLED-NEXT: add.s32 %r29, %r28, %r14;
+; DISABLED-NEXT: add.s32 %r30, %r29, %r15;
+; DISABLED-NEXT: add.s32 %r31, %r30, %r16;
+; DISABLED-NEXT: st.u32 [%rd2], %r31;
+; DISABLED-NEXT: ret;
%val0 = load i8, ptr %ptr1, align 16
%ptr1.1 = getelementptr inbounds i8, ptr %ptr1, i64 1
%val1 = load i8, ptr %ptr1.1, align 1
@@ -122,9 +320,219 @@ define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
ret void
}
+define void @combine_v16i8_unaligned(ptr noundef align 8 %ptr1, ptr noundef align 16 %ptr2) {
+; ENABLED-LABEL: combine_v16i8_unaligned(
+; ENABLED: {
+; ENABLED-NEXT: .reg .b32 %r<36>;
+; ENABLED-NEXT: .reg .b64 %rd<3>;
+; ENABLED-EMPTY:
+; ENABLED-NEXT: // %bb.0:
+; ENABLED-NEXT: ld.param.u64 %rd1, [combine_v16i8_unaligned_param_0];
+; ENABLED-NEXT: ld.u32 %r1, [%rd1+4];
+; ENABLED-NEXT: ld.u32 %r2, [%rd1];
+; ENABLED-NEXT: ld.param.u64 %rd2, [combine_v16i8_unaligned_param_1];
+; ENABLED-NEXT: ld.u32 %r3, [%rd1+12];
+; ENABLED-NEXT: ld.u32 %r4, [%rd1+8];
+; ENABLED-NEXT: bfe.u32 %r5, %r2, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r6, %r2, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r7, %r2, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r8, %r2, 24, 8;
+; ENABLED-NEXT: bfe.u32 %r9, %r1, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r10, %r1, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r11, %r1, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r12, %r1, 24, 8;
+; ENABLED-NEXT: bfe.u32 %r13, %r4, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r14, %r4, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r15, %r4, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r16, %r4, 24, 8;
+; ENABLED-NEXT: bfe.u32 %r17, %r3, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r18, %r3, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r19, %r3, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r20, %r3, 24, 8;
+; ENABLED-NEXT: add.s32 %r21, %r5, %r6;
+; ENABLED-NEXT: add.s32 %r22, %r21, %r7;
+; ENABLED-NEXT: add.s32 %r23, %r22, %r8;
+; ENABLED-NEXT: add.s32 %r24, %r23, %r9;
+; ENABLED-NEXT: add.s32 %r25, %r24, %r10;
+; ENABLED-NEXT: add.s32 %r26, %r25, %r11;
+; ENABLED-NEXT: add.s32 %r27, %r26, %r12;
+; ENABLED-NEXT: add.s32 %r28, %r27, %r13;
+; ENABLED-NEXT: add.s32 %r29, %r28, %r14;
+; ENABLED-NEXT: add.s32 %r30, %r29, %r15;
+; ENABLED-NEXT: add.s32 %r31, %r30, %r16;
+; ENABLED-NEXT: add.s32 %r32, %r31, %r17;
+; ENABLED-NEXT: add.s32 %r33, %r32, %r18;
+; ENABLED-NEXT: add.s32 %r34, %r33, %r19;
+; ENABLED-NEXT: add.s32 %r35, %r34, %r20;
+; ENABLED-NEXT: st.u32 [%rd2], %r35;
+; ENABLED-NEXT: ret;
+;
+; DISABLED-LABEL: combine_v16i8_unaligned(
+; DISABLED: {
+; DISABLED-NEXT: .reg .b32 %r<32>;
+; DISABLED-NEXT: .reg .b64 %rd<3>;
+; DISABLED-EMPTY:
+; DISABLED-NEXT: // %bb.0:
+; DISABLED-NEXT: ld.param.u64 %rd1, [combine_v16i8_unaligned_param_0];
+; DISABLED-NEXT: ld.u8 %r1, [%rd1];
+; DISABLED-NEXT: ld.param.u64 %rd2, [combine_v16i8_unaligned_param_1];
+; DISABLED-NEXT: ld.u8 %r2, [%rd1+1];
+; DISABLED-NEXT: ld.u8 %r3, [%rd1+2];
+; DISABLED-NEXT: ld.u8 %r4, [%rd1+3];
+; DISABLED-NEXT: ld.u8 %r5, [%rd1+4];
+; DISABLED-NEXT: ld.u8 %r6, [%rd1+5];
+; DISABLED-NEXT: ld.u8 %r7, [%rd1+6];
+; DISABLED-NEXT: ld.u8 %r8, [%rd1+7];
+; DISABLED-NEXT: ld.u8 %r9, [%rd1+8];
+; DISABLED-NEXT: ld.u8 %r10, [%rd1+9];
+; DISABLED-NEXT: ld.u8 %r11, [%rd1+10];
+; DISABLED-NEXT: ld.u8 %r12, [%rd1+11];
+; DISABLED-NEXT: ld.u8 %r13, [%rd1+12];
+; DISABLED-NEXT: ld.u8 %r14, [%rd1+13];
+; DISABLED-NEXT: ld.u8 %r15, [%rd1+14];
+; DISABLED-NEXT: ld.u8 %r16, [%rd1+15];
+; DISABLED-NEXT: add.s32 %r17, %r1, %r2;
+; DISABLED-NEXT: add.s32 %r18, %r17, %r3;
+; DISABLED-NEXT: add.s32 %r19, %r18, %r4;
+; DISABLED-NEXT: add.s32 %r20, %r19, %r5;
+; DISABLED-NEXT: add.s32 %r21, %r20, %r6;
+; DISABLED-NEXT: add.s32 %r22, %r21, %r7;
+; DISABLED-NEXT: add.s32 %r23, %r22, %r8;
+; DISABLED-NEXT: add.s32 %r24, %r23, %r9;
+; DISABLED-NEXT: add.s32 %r25, %r24, %r10;
+; DISABLED-NEXT: add.s32 %r26, %r25, %r11;
+; DISABLED-NEXT: add.s32 %r27, %r26, %r12;
+; DISABLED-NEXT: add.s32 %r28, %r27, %r13;
+; DISABLED-NEXT: add.s32 %r29, %r28, %r14;
+; DISABLED-NEXT: add.s32 %r30, %r29, %r15;
+; DISABLED-NEXT: add.s32 %r31, %r30, %r16;
+; DISABLED-NEXT: st.u32 [%rd2], %r31;
+; DISABLED-NEXT: ret;
+ %val0 = load i8, ptr %ptr1, align 8
+ %ptr1.1 = getelementptr inbounds i8, ptr %ptr1, i64 1
+ %val1 = load i8, ptr %ptr1.1, align 1
+ %ptr1.2 = getelementptr inbounds i8, ptr %ptr1, i64 2
+ %val2 = load i8, ptr %ptr1.2, align 2
+ %ptr1.3 = getelementptr inbounds i8, ptr %ptr1, i64 3
+ %val3 = load i8, ptr %ptr1.3, align 1
+ %ptr1.4 = getelementptr inbounds i8, ptr %ptr1, i64 4
+ %val4 = load i8, ptr %ptr1.4, align 4
+ %ptr1.5 = getelementptr inbounds i8, ptr %ptr1, i64 5
+ %val5 = load i8, ptr %ptr1.5, align 1
+ %ptr1.6 = getelementptr inbounds i8, ptr %ptr1, i64 6
+ %val6 = load i8, ptr %ptr1.6, align 2
+ %ptr1.7 = getelementptr inbounds i8, ptr %ptr1, i64 7
+ %val7 = load i8, ptr %ptr1.7, align 1
+ %ptr1.8 = getelementptr inbounds i8, ptr %ptr1, i64 8
+ %val8 = load i8, ptr %ptr1.8, align 8
+ %ptr1.9 = getelementptr inbounds i8, ptr %ptr1, i64 9
+ %val9 = load i8, ptr %ptr1.9, align 1
+ %ptr1.10 = getelementptr inbounds i8, ptr %ptr1, i64 10
+ %val10 = load i8, ptr %ptr1.10, align 2
+ %ptr1.11 = getelementptr inbounds i8, ptr %ptr1, i64 11
+ %val11 = load i8, ptr %ptr1.11, align 1
+ %ptr1.12 = getelementptr inbounds i8, ptr %ptr1, i64 12
+ %val12 = load i8, ptr %ptr1.12, align 4
+ %ptr1.13 = getelementptr inbounds i8, ptr %ptr1, i64 13
+ %val13 = load i8, ptr %ptr1.13, align 1
+ %ptr1.14 = getelementptr inbounds i8, ptr %ptr1, i64 14
+ %val14 = load i8, ptr %ptr1.14, align 2
+ %ptr1.15 = getelementptr inbounds i8, ptr %ptr1, i64 15
+ %val15 = load i8, ptr %ptr1.15, align 1
+ %lane0 = zext i8 %val0 to i32
+ %lane1 = zext i8 %val1 to i32
+ %lane2 = zext i8 %val2 to i32
+ %lane3 = zext i8 %val3 to i32
+ %lane4 = zext i8 %val4 to i32
+ %lane5 = zext i8 %val5 to i32
+ %lane6 = zext i8 %val6 to i32
+ %lane7 = zext i8 %val7 to i32
+ %lane8 = zext i8 %val8 to i32
+ %lane9 = zext i8 %val9 to i32
+ %lane10 = zext i8 %val10 to i32
+ %lane11 = zext i8 %val11 to i32
+ %lane12 = zext i8 %val12 to i32
+ %lane13 = zext i8 %val13 to i32
+ %lane14 = zext i8 %val14 to i32
+ %lane15 = zext i8 %val15 to i32
+ %red.1 = add i32 %lane0, %lane1
+ %red.2 = add i32 %red.1, %lane2
+ %red.3 = add i32 %red.2, %lane3
+ %red.4 = add i32 %red.3, %lane4
+ %red.5 = add i32 %red.4, %lane5
+ %red.6 = add i32 %red.5, %lane6
+ %red.7 = add i32 %red.6, %lane7
+ %red.8 = add i32 %red.7, %lane8
+ %red.9 = add i32 %red.8, %lane9
+ %red.10 = add i32 %red.9, %lane10
+ %red.11 = add i32 %red.10, %lane11
+ %red.12 = add i32 %red.11, %lane12
+ %red.13 = add i32 %red.12, %lane13
+ %red.14 = add i32 %red.13, %lane14
+ %red = add i32 %red.14, %lane15
+ store i32 %red, ptr %ptr2, align 4
+ ret void
+}
+
+
define void @combine_v8i16(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr2) {
- ; ENABLED-LABEL: combine_v8i16
- ; ENABLED: ld.v4.b32
+; ENABLED-LABEL: combine_v8i16(
+; ENABLED: {
+; ENABLED-NEXT: .reg .b16 %rs<9>;
+; ENABLED-NEXT: .reg .b32 %r<20>;
+; ENABLED-NEXT: .reg .b64 %rd<3>;
+; ENABLED-EMPTY:
+; ENABLED-NEXT: // %bb.0:
+; ENABLED-NEXT: ld.param.u64 %rd1, [combine_v8i16_param_0];
+; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; ENABLED-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; ENABLED-NEXT: mov.b32 {%rs3, %rs4}, %r3;
+; ENABLED-NEXT: mov.b32 {%rs5, %rs6}, %r2;
+; ENABLED-NEXT: mov.b32 {%rs7, %rs8}, %r1;
+; ENABLED-NEXT: ld.param.u64 %rd2, [combine_v8i16_param_1];
+; ENABLED-NEXT: cvt.u32.u16 %r5, %rs7;
+; ENABLED-NEXT: cvt.u32.u16 %r6, %rs8;
+; ENABLED-NEXT: cvt.u32.u16 %r7, %rs5;
+; ENABLED-NEXT: cvt.u32.u16 %r8, %rs6;
+; ENABLED-NEXT: cvt.u32.u16 %r9, %rs3;
+; ENABLED-NEXT: cvt.u32.u16 %r10, %rs4;
+; ENABLED-NEXT: cvt.u32.u16 %r11, %rs1;
+; ENABLED-NEXT: cvt.u32.u16 %r12, %rs2;
+; ENABLED-NEXT: add.s32 %r13, %r5, %r6;
+; ENABLED-NEXT: add.s32 %r14, %r13, %r7;
+; ENABLED-NEXT: add.s32 %r15, %r14, %r8;
+; ENABLED-NEXT: add.s32 %r16, %r15, %r9;
+; ENABLED-NEXT: add.s32 %r17, %r16, %r10;
+; ENABLED-NEXT: add.s32 %r18, %r17, %r11;
+; ENABLED-NEXT: add.s32 %r19, %r18, %r12;
+; ENABLED-NEXT: st.u32 [%rd2], %r19;
+; ENABLED-NEXT: ret;
+;
+; DISABLED-LABEL: combine_v8i16(
+; DISABLED: {
+; DISABLED-NEXT: .reg .b32 %r<16>;
+; DISABLED-NEXT: .reg .b64 %rd<3>;
+; DISABLED-EMPTY:
+; DISABLED-NEXT: // %bb.0:
+; DISABLED-NEXT: ld.param.u64 %rd1, [combine_v8i16_param_0];
+; DISABLED-NEXT: ld.u16 %r1, [%rd1];
+; DISABLED-NEXT: ld.param.u64 %rd2, [combine_v8i16_param_1];
+; DISABLED-NEXT: ld.u16 %r2, [%rd1+2];
+; DISABLED-NEXT: ld.u16 %r3, [%rd1+4];
+; DISABLED-NEXT: ld.u16 %r4, [%rd1+6];
+; DISABLED-NEXT: ld.u16 %r5, [%rd1+8];
+; DISABLED-NEXT: ld.u16 %r6, [%rd1+10];
+; DISABLED-NEXT: ld.u16 %r7, [%rd1+12];
+; DISABLED-NEXT: ld.u16 %r8, [%rd1+14];
+; DISABLED-NEXT: add.s32 %r9, %r1, %r2;
+; DISABLED-NEXT: add.s32 %r10, %r9, %r3;
+; DISABLED-NEXT: add.s32 %r11, %r10, %r4;
+; DISABLED-NEXT: add.s32 %r12, %r11, %r5;
+; DISABLED-NEXT: add.s32 %r13, %r12, %r6;
+; DISABLED-NEXT: add.s32 %r14, %r13, %r7;
+; DISABLED-NEXT: add.s32 %r15, %r14, %r8;
+; DISABLED-NEXT: st.u32 [%rd2], %r15;
+; DISABLED-NEXT: ret;
%val0 = load i16, ptr %ptr1, align 16
%ptr1.1 = getelementptr inbounds i16, ptr %ptr1, i64 1
%val1 = load i16, ptr %ptr1.1, align 2
@@ -160,8 +568,38 @@ define void @combine_v8i16(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
}
define void @combine_v4i32(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr2) {
- ; ENABLED-LABEL: combine_v4i32
- ; ENABLED: ld.v4.u32
+; ENABLED-LABEL: combine_v4i32(
+; ENABLED: {
+; ENABLED-NEXT: .reg .b32 %r<8>;
+; ENABLED-NEXT: .reg .b64 %rd<3>;
+; ENABLED-EMPTY:
+; ENABLED-NEXT: // %bb.0:
+; ENABLED-NEXT: ld....
[truncated]
Original v16i8 combining code was introduced in #67322.
LGTM, Thanks!
Fixes generation of invalid loads leading to misaligned access errors. The bug was exposed by SLP vectorizer change ec360d6, which allowed SLP to produce v16i8 vectors. Also updated the tests to use the automatic check generator.