[NVPTX] Restrict combining to properly aligned v16i8 vectors. #107919

Merged
1 commit merged into llvm:main on Sep 9, 2024

Conversation

@Artem-B Artem-B (Member) commented Sep 9, 2024

Fixes generation of invalid loads that led to misaligned-access errors.
The bug was exposed by SLP vectorizer change ec360d6, which allowed SLP to produce v16i8 vectors.

Also updated the tests to use the automatic check generator (update_llc_test_checks.py).
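
For illustration only (not part of this patch), here is a minimal LLVM IR sketch of the situation the combine must now reject, assuming the SLP vectorizer (after ec360d6) has already formed an under-aligned <16 x i8> load; the function names are hypothetical:

; Sketch: combining a v16i8 load into an i32x4 load (ld.v4.u32) implies a
; single 16-byte access, which is only legal when the load is 16-byte aligned.
target triple = "nvptx64-nvidia-cuda"

define <16 x i8> @sketch_v16i8_align8(ptr align 8 %p) {
  ; Only 8-byte aligned: the combine must bail out here, otherwise the
  ; backend would emit a misaligned 16-byte vector load.
  %v = load <16 x i8>, ptr %p, align 8
  ret <16 x i8> %v
}

define <16 x i8> @sketch_v16i8_align16(ptr align 16 %p) {
  ; 16-byte aligned: the combine to a LoadV4 with i32 results still applies.
  %v = load <16 x i8>, ptr %p, align 16
  ret <16 x i8> %v
}

The new combine_v16i8_unaligned test in this patch exercises the first case and checks that the backend falls back to plain 32-bit loads instead of ld.v4.u32.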

Fixes generation of invalid loads leading to misaligned access errors.

Update tests to use automatic check generator.
@llvmbot (Member) commented Sep 9, 2024

@llvm/pr-subscribers-backend-nvptx

Author: Artem Belevich (Artem-B)

Changes

Fixes generation of invalid loads that led to misaligned-access errors.
The bug was exposed by SLP vectorizer change ec360d6, which allowed SLP to produce v16i8 vectors.

Also updated the tests to use the automatic check generator (update_llc_test_checks.py).


Patch is 21.16 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/107919.diff

2 Files Affected:

  • (modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (+5-1)
  • (modified) llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll (+447-9)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index bb76ffdfd99d7b..5c5766a8b23455 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -6038,7 +6038,11 @@ static SDValue PerformLOADCombine(SDNode *N,
   // elements can be optimised away instead of being needlessly split during
   // legalization, which involves storing to the stack and loading it back.
   EVT VT = N->getValueType(0);
-  if (VT != MVT::v16i8)
+  bool CorrectlyAligned =
+      DCI.DAG.getTargetLoweringInfo().allowsMemoryAccessForAlignment(
+          *DAG.getContext(), DAG.getDataLayout(), LD->getMemoryVT(),
+          *LD->getMemOperand());
+  if (!(VT == MVT::v16i8 && CorrectlyAligned))
     return SDValue();
 
   SDLoc DL(N);
diff --git a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
index 868a06e2a850cc..bc58a700cb9828 100644
--- a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
+++ b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s | FileCheck -check-prefix=ENABLED %s
 ; RUN: llc -disable-nvptx-load-store-vectorizer < %s | FileCheck -check-prefix=DISABLED %s
 ; RUN: %if ptxas %{ llc < %s | %ptxas-verify %}
@@ -8,10 +9,31 @@ target triple = "nvptx64-nvidia-cuda"
 ; Check that the load-store vectorizer is enabled by default for nvptx, and
 ; that it's disabled by the appropriate flag.
 
-; ENABLED: ld.v2.{{.}}32
-; DISABLED: ld.{{.}}32
-; DISABLED: ld.{{.}}32
 define i32 @f(ptr %p) {
+; ENABLED-LABEL: f(
+; ENABLED:       {
+; ENABLED-NEXT:    .reg .b32 %r<4>;
+; ENABLED-NEXT:    .reg .b64 %rd<2>;
+; ENABLED-EMPTY:
+; ENABLED-NEXT:  // %bb.0:
+; ENABLED-NEXT:    ld.param.u64 %rd1, [f_param_0];
+; ENABLED-NEXT:    ld.v2.u32 {%r1, %r2}, [%rd1];
+; ENABLED-NEXT:    add.s32 %r3, %r1, %r2;
+; ENABLED-NEXT:    st.param.b32 [func_retval0+0], %r3;
+; ENABLED-NEXT:    ret;
+;
+; DISABLED-LABEL: f(
+; DISABLED:       {
+; DISABLED-NEXT:    .reg .b32 %r<4>;
+; DISABLED-NEXT:    .reg .b64 %rd<2>;
+; DISABLED-EMPTY:
+; DISABLED-NEXT:  // %bb.0:
+; DISABLED-NEXT:    ld.param.u64 %rd1, [f_param_0];
+; DISABLED-NEXT:    ld.u32 %r1, [%rd1];
+; DISABLED-NEXT:    ld.u32 %r2, [%rd1+4];
+; DISABLED-NEXT:    add.s32 %r3, %r1, %r2;
+; DISABLED-NEXT:    st.param.b32 [func_retval0+0], %r3;
+; DISABLED-NEXT:    ret;
   %p.1 = getelementptr i32, ptr %p, i32 1
   %v0 = load i32, ptr %p, align 8
   %v1 = load i32, ptr %p.1, align 4
@@ -20,6 +42,66 @@ define i32 @f(ptr %p) {
 }
 
 define half @fh(ptr %p) {
+; ENABLED-LABEL: fh(
+; ENABLED:       {
+; ENABLED-NEXT:    .reg .b16 %rs<10>;
+; ENABLED-NEXT:    .reg .f32 %f<13>;
+; ENABLED-NEXT:    .reg .b64 %rd<2>;
+; ENABLED-EMPTY:
+; ENABLED-NEXT:  // %bb.0:
+; ENABLED-NEXT:    ld.param.u64 %rd1, [fh_param_0];
+; ENABLED-NEXT:    ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; ENABLED-NEXT:    ld.b16 %rs5, [%rd1+8];
+; ENABLED-NEXT:    cvt.f32.f16 %f1, %rs2;
+; ENABLED-NEXT:    cvt.f32.f16 %f2, %rs1;
+; ENABLED-NEXT:    add.rn.f32 %f3, %f2, %f1;
+; ENABLED-NEXT:    cvt.rn.f16.f32 %rs6, %f3;
+; ENABLED-NEXT:    cvt.f32.f16 %f4, %rs4;
+; ENABLED-NEXT:    cvt.f32.f16 %f5, %rs3;
+; ENABLED-NEXT:    add.rn.f32 %f6, %f5, %f4;
+; ENABLED-NEXT:    cvt.rn.f16.f32 %rs7, %f6;
+; ENABLED-NEXT:    cvt.f32.f16 %f7, %rs7;
+; ENABLED-NEXT:    cvt.f32.f16 %f8, %rs6;
+; ENABLED-NEXT:    add.rn.f32 %f9, %f8, %f7;
+; ENABLED-NEXT:    cvt.rn.f16.f32 %rs8, %f9;
+; ENABLED-NEXT:    cvt.f32.f16 %f10, %rs8;
+; ENABLED-NEXT:    cvt.f32.f16 %f11, %rs5;
+; ENABLED-NEXT:    add.rn.f32 %f12, %f10, %f11;
+; ENABLED-NEXT:    cvt.rn.f16.f32 %rs9, %f12;
+; ENABLED-NEXT:    st.param.b16 [func_retval0+0], %rs9;
+; ENABLED-NEXT:    ret;
+;
+; DISABLED-LABEL: fh(
+; DISABLED:       {
+; DISABLED-NEXT:    .reg .b16 %rs<10>;
+; DISABLED-NEXT:    .reg .f32 %f<13>;
+; DISABLED-NEXT:    .reg .b64 %rd<2>;
+; DISABLED-EMPTY:
+; DISABLED-NEXT:  // %bb.0:
+; DISABLED-NEXT:    ld.param.u64 %rd1, [fh_param_0];
+; DISABLED-NEXT:    ld.b16 %rs1, [%rd1];
+; DISABLED-NEXT:    ld.b16 %rs2, [%rd1+2];
+; DISABLED-NEXT:    ld.b16 %rs3, [%rd1+4];
+; DISABLED-NEXT:    ld.b16 %rs4, [%rd1+6];
+; DISABLED-NEXT:    ld.b16 %rs5, [%rd1+8];
+; DISABLED-NEXT:    cvt.f32.f16 %f1, %rs2;
+; DISABLED-NEXT:    cvt.f32.f16 %f2, %rs1;
+; DISABLED-NEXT:    add.rn.f32 %f3, %f2, %f1;
+; DISABLED-NEXT:    cvt.rn.f16.f32 %rs6, %f3;
+; DISABLED-NEXT:    cvt.f32.f16 %f4, %rs4;
+; DISABLED-NEXT:    cvt.f32.f16 %f5, %rs3;
+; DISABLED-NEXT:    add.rn.f32 %f6, %f5, %f4;
+; DISABLED-NEXT:    cvt.rn.f16.f32 %rs7, %f6;
+; DISABLED-NEXT:    cvt.f32.f16 %f7, %rs7;
+; DISABLED-NEXT:    cvt.f32.f16 %f8, %rs6;
+; DISABLED-NEXT:    add.rn.f32 %f9, %f8, %f7;
+; DISABLED-NEXT:    cvt.rn.f16.f32 %rs8, %f9;
+; DISABLED-NEXT:    cvt.f32.f16 %f10, %rs8;
+; DISABLED-NEXT:    cvt.f32.f16 %f11, %rs5;
+; DISABLED-NEXT:    add.rn.f32 %f12, %f10, %f11;
+; DISABLED-NEXT:    cvt.rn.f16.f32 %rs9, %f12;
+; DISABLED-NEXT:    st.param.b16 [func_retval0+0], %rs9;
+; DISABLED-NEXT:    ret;
   %p.1 = getelementptr half, ptr %p, i32 1
   %p.2 = getelementptr half, ptr %p, i32 2
   %p.3 = getelementptr half, ptr %p, i32 3
@@ -37,6 +119,40 @@ define half @fh(ptr %p) {
 }
 
 define float @ff(ptr %p) {
+; ENABLED-LABEL: ff(
+; ENABLED:       {
+; ENABLED-NEXT:    .reg .f32 %f<10>;
+; ENABLED-NEXT:    .reg .b64 %rd<2>;
+; ENABLED-EMPTY:
+; ENABLED-NEXT:  // %bb.0:
+; ENABLED-NEXT:    ld.param.u64 %rd1, [ff_param_0];
+; ENABLED-NEXT:    ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; ENABLED-NEXT:    ld.f32 %f5, [%rd1+16];
+; ENABLED-NEXT:    add.rn.f32 %f6, %f1, %f2;
+; ENABLED-NEXT:    add.rn.f32 %f7, %f3, %f4;
+; ENABLED-NEXT:    add.rn.f32 %f8, %f6, %f7;
+; ENABLED-NEXT:    add.rn.f32 %f9, %f8, %f5;
+; ENABLED-NEXT:    st.param.f32 [func_retval0+0], %f9;
+; ENABLED-NEXT:    ret;
+;
+; DISABLED-LABEL: ff(
+; DISABLED:       {
+; DISABLED-NEXT:    .reg .f32 %f<10>;
+; DISABLED-NEXT:    .reg .b64 %rd<2>;
+; DISABLED-EMPTY:
+; DISABLED-NEXT:  // %bb.0:
+; DISABLED-NEXT:    ld.param.u64 %rd1, [ff_param_0];
+; DISABLED-NEXT:    ld.f32 %f1, [%rd1];
+; DISABLED-NEXT:    ld.f32 %f2, [%rd1+4];
+; DISABLED-NEXT:    ld.f32 %f3, [%rd1+8];
+; DISABLED-NEXT:    ld.f32 %f4, [%rd1+12];
+; DISABLED-NEXT:    ld.f32 %f5, [%rd1+16];
+; DISABLED-NEXT:    add.rn.f32 %f6, %f1, %f2;
+; DISABLED-NEXT:    add.rn.f32 %f7, %f3, %f4;
+; DISABLED-NEXT:    add.rn.f32 %f8, %f6, %f7;
+; DISABLED-NEXT:    add.rn.f32 %f9, %f8, %f5;
+; DISABLED-NEXT:    st.param.f32 [func_retval0+0], %f9;
+; DISABLED-NEXT:    ret;
   %p.1 = getelementptr float, ptr %p, i32 1
   %p.2 = getelementptr float, ptr %p, i32 2
   %p.3 = getelementptr float, ptr %p, i32 3
@@ -54,8 +170,90 @@ define float @ff(ptr %p) {
 }
 
 define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr2) {
-  ; ENABLED-LABEL: combine_v16i8
-  ; ENABLED: ld.v4.u32
+; ENABLED-LABEL: combine_v16i8(
+; ENABLED:       {
+; ENABLED-NEXT:    .reg .b32 %r<40>;
+; ENABLED-NEXT:    .reg .b64 %rd<3>;
+; ENABLED-EMPTY:
+; ENABLED-NEXT:  // %bb.0:
+; ENABLED-NEXT:    ld.param.u64 %rd1, [combine_v16i8_param_0];
+; ENABLED-NEXT:    ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; ENABLED-NEXT:    ld.param.u64 %rd2, [combine_v16i8_param_1];
+; ENABLED-NEXT:    bfe.u32 %r9, %r1, 0, 8;
+; ENABLED-NEXT:    bfe.u32 %r10, %r1, 8, 8;
+; ENABLED-NEXT:    bfe.u32 %r11, %r1, 16, 8;
+; ENABLED-NEXT:    bfe.u32 %r12, %r1, 24, 8;
+; ENABLED-NEXT:    bfe.u32 %r13, %r2, 0, 8;
+; ENABLED-NEXT:    bfe.u32 %r14, %r2, 8, 8;
+; ENABLED-NEXT:    bfe.u32 %r15, %r2, 16, 8;
+; ENABLED-NEXT:    bfe.u32 %r16, %r2, 24, 8;
+; ENABLED-NEXT:    bfe.u32 %r17, %r3, 0, 8;
+; ENABLED-NEXT:    bfe.u32 %r18, %r3, 8, 8;
+; ENABLED-NEXT:    bfe.u32 %r19, %r3, 16, 8;
+; ENABLED-NEXT:    bfe.u32 %r20, %r3, 24, 8;
+; ENABLED-NEXT:    bfe.u32 %r21, %r4, 0, 8;
+; ENABLED-NEXT:    bfe.u32 %r22, %r4, 8, 8;
+; ENABLED-NEXT:    bfe.u32 %r23, %r4, 16, 8;
+; ENABLED-NEXT:    bfe.u32 %r24, %r4, 24, 8;
+; ENABLED-NEXT:    add.s32 %r25, %r9, %r10;
+; ENABLED-NEXT:    add.s32 %r26, %r25, %r11;
+; ENABLED-NEXT:    add.s32 %r27, %r26, %r12;
+; ENABLED-NEXT:    add.s32 %r28, %r27, %r13;
+; ENABLED-NEXT:    add.s32 %r29, %r28, %r14;
+; ENABLED-NEXT:    add.s32 %r30, %r29, %r15;
+; ENABLED-NEXT:    add.s32 %r31, %r30, %r16;
+; ENABLED-NEXT:    add.s32 %r32, %r31, %r17;
+; ENABLED-NEXT:    add.s32 %r33, %r32, %r18;
+; ENABLED-NEXT:    add.s32 %r34, %r33, %r19;
+; ENABLED-NEXT:    add.s32 %r35, %r34, %r20;
+; ENABLED-NEXT:    add.s32 %r36, %r35, %r21;
+; ENABLED-NEXT:    add.s32 %r37, %r36, %r22;
+; ENABLED-NEXT:    add.s32 %r38, %r37, %r23;
+; ENABLED-NEXT:    add.s32 %r39, %r38, %r24;
+; ENABLED-NEXT:    st.u32 [%rd2], %r39;
+; ENABLED-NEXT:    ret;
+;
+; DISABLED-LABEL: combine_v16i8(
+; DISABLED:       {
+; DISABLED-NEXT:    .reg .b32 %r<32>;
+; DISABLED-NEXT:    .reg .b64 %rd<3>;
+; DISABLED-EMPTY:
+; DISABLED-NEXT:  // %bb.0:
+; DISABLED-NEXT:    ld.param.u64 %rd1, [combine_v16i8_param_0];
+; DISABLED-NEXT:    ld.u8 %r1, [%rd1];
+; DISABLED-NEXT:    ld.param.u64 %rd2, [combine_v16i8_param_1];
+; DISABLED-NEXT:    ld.u8 %r2, [%rd1+1];
+; DISABLED-NEXT:    ld.u8 %r3, [%rd1+2];
+; DISABLED-NEXT:    ld.u8 %r4, [%rd1+3];
+; DISABLED-NEXT:    ld.u8 %r5, [%rd1+4];
+; DISABLED-NEXT:    ld.u8 %r6, [%rd1+5];
+; DISABLED-NEXT:    ld.u8 %r7, [%rd1+6];
+; DISABLED-NEXT:    ld.u8 %r8, [%rd1+7];
+; DISABLED-NEXT:    ld.u8 %r9, [%rd1+8];
+; DISABLED-NEXT:    ld.u8 %r10, [%rd1+9];
+; DISABLED-NEXT:    ld.u8 %r11, [%rd1+10];
+; DISABLED-NEXT:    ld.u8 %r12, [%rd1+11];
+; DISABLED-NEXT:    ld.u8 %r13, [%rd1+12];
+; DISABLED-NEXT:    ld.u8 %r14, [%rd1+13];
+; DISABLED-NEXT:    ld.u8 %r15, [%rd1+14];
+; DISABLED-NEXT:    ld.u8 %r16, [%rd1+15];
+; DISABLED-NEXT:    add.s32 %r17, %r1, %r2;
+; DISABLED-NEXT:    add.s32 %r18, %r17, %r3;
+; DISABLED-NEXT:    add.s32 %r19, %r18, %r4;
+; DISABLED-NEXT:    add.s32 %r20, %r19, %r5;
+; DISABLED-NEXT:    add.s32 %r21, %r20, %r6;
+; DISABLED-NEXT:    add.s32 %r22, %r21, %r7;
+; DISABLED-NEXT:    add.s32 %r23, %r22, %r8;
+; DISABLED-NEXT:    add.s32 %r24, %r23, %r9;
+; DISABLED-NEXT:    add.s32 %r25, %r24, %r10;
+; DISABLED-NEXT:    add.s32 %r26, %r25, %r11;
+; DISABLED-NEXT:    add.s32 %r27, %r26, %r12;
+; DISABLED-NEXT:    add.s32 %r28, %r27, %r13;
+; DISABLED-NEXT:    add.s32 %r29, %r28, %r14;
+; DISABLED-NEXT:    add.s32 %r30, %r29, %r15;
+; DISABLED-NEXT:    add.s32 %r31, %r30, %r16;
+; DISABLED-NEXT:    st.u32 [%rd2], %r31;
+; DISABLED-NEXT:    ret;
   %val0 = load i8, ptr %ptr1, align 16
   %ptr1.1 = getelementptr inbounds i8, ptr %ptr1, i64 1
   %val1 = load i8, ptr %ptr1.1, align 1
@@ -122,9 +320,219 @@ define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
   ret void
 }
 
+define void @combine_v16i8_unaligned(ptr noundef align 8 %ptr1, ptr noundef align 16 %ptr2) {
+; ENABLED-LABEL: combine_v16i8_unaligned(
+; ENABLED:       {
+; ENABLED-NEXT:    .reg .b32 %r<36>;
+; ENABLED-NEXT:    .reg .b64 %rd<3>;
+; ENABLED-EMPTY:
+; ENABLED-NEXT:  // %bb.0:
+; ENABLED-NEXT:    ld.param.u64 %rd1, [combine_v16i8_unaligned_param_0];
+; ENABLED-NEXT:    ld.u32 %r1, [%rd1+4];
+; ENABLED-NEXT:    ld.u32 %r2, [%rd1];
+; ENABLED-NEXT:    ld.param.u64 %rd2, [combine_v16i8_unaligned_param_1];
+; ENABLED-NEXT:    ld.u32 %r3, [%rd1+12];
+; ENABLED-NEXT:    ld.u32 %r4, [%rd1+8];
+; ENABLED-NEXT:    bfe.u32 %r5, %r2, 0, 8;
+; ENABLED-NEXT:    bfe.u32 %r6, %r2, 8, 8;
+; ENABLED-NEXT:    bfe.u32 %r7, %r2, 16, 8;
+; ENABLED-NEXT:    bfe.u32 %r8, %r2, 24, 8;
+; ENABLED-NEXT:    bfe.u32 %r9, %r1, 0, 8;
+; ENABLED-NEXT:    bfe.u32 %r10, %r1, 8, 8;
+; ENABLED-NEXT:    bfe.u32 %r11, %r1, 16, 8;
+; ENABLED-NEXT:    bfe.u32 %r12, %r1, 24, 8;
+; ENABLED-NEXT:    bfe.u32 %r13, %r4, 0, 8;
+; ENABLED-NEXT:    bfe.u32 %r14, %r4, 8, 8;
+; ENABLED-NEXT:    bfe.u32 %r15, %r4, 16, 8;
+; ENABLED-NEXT:    bfe.u32 %r16, %r4, 24, 8;
+; ENABLED-NEXT:    bfe.u32 %r17, %r3, 0, 8;
+; ENABLED-NEXT:    bfe.u32 %r18, %r3, 8, 8;
+; ENABLED-NEXT:    bfe.u32 %r19, %r3, 16, 8;
+; ENABLED-NEXT:    bfe.u32 %r20, %r3, 24, 8;
+; ENABLED-NEXT:    add.s32 %r21, %r5, %r6;
+; ENABLED-NEXT:    add.s32 %r22, %r21, %r7;
+; ENABLED-NEXT:    add.s32 %r23, %r22, %r8;
+; ENABLED-NEXT:    add.s32 %r24, %r23, %r9;
+; ENABLED-NEXT:    add.s32 %r25, %r24, %r10;
+; ENABLED-NEXT:    add.s32 %r26, %r25, %r11;
+; ENABLED-NEXT:    add.s32 %r27, %r26, %r12;
+; ENABLED-NEXT:    add.s32 %r28, %r27, %r13;
+; ENABLED-NEXT:    add.s32 %r29, %r28, %r14;
+; ENABLED-NEXT:    add.s32 %r30, %r29, %r15;
+; ENABLED-NEXT:    add.s32 %r31, %r30, %r16;
+; ENABLED-NEXT:    add.s32 %r32, %r31, %r17;
+; ENABLED-NEXT:    add.s32 %r33, %r32, %r18;
+; ENABLED-NEXT:    add.s32 %r34, %r33, %r19;
+; ENABLED-NEXT:    add.s32 %r35, %r34, %r20;
+; ENABLED-NEXT:    st.u32 [%rd2], %r35;
+; ENABLED-NEXT:    ret;
+;
+; DISABLED-LABEL: combine_v16i8_unaligned(
+; DISABLED:       {
+; DISABLED-NEXT:    .reg .b32 %r<32>;
+; DISABLED-NEXT:    .reg .b64 %rd<3>;
+; DISABLED-EMPTY:
+; DISABLED-NEXT:  // %bb.0:
+; DISABLED-NEXT:    ld.param.u64 %rd1, [combine_v16i8_unaligned_param_0];
+; DISABLED-NEXT:    ld.u8 %r1, [%rd1];
+; DISABLED-NEXT:    ld.param.u64 %rd2, [combine_v16i8_unaligned_param_1];
+; DISABLED-NEXT:    ld.u8 %r2, [%rd1+1];
+; DISABLED-NEXT:    ld.u8 %r3, [%rd1+2];
+; DISABLED-NEXT:    ld.u8 %r4, [%rd1+3];
+; DISABLED-NEXT:    ld.u8 %r5, [%rd1+4];
+; DISABLED-NEXT:    ld.u8 %r6, [%rd1+5];
+; DISABLED-NEXT:    ld.u8 %r7, [%rd1+6];
+; DISABLED-NEXT:    ld.u8 %r8, [%rd1+7];
+; DISABLED-NEXT:    ld.u8 %r9, [%rd1+8];
+; DISABLED-NEXT:    ld.u8 %r10, [%rd1+9];
+; DISABLED-NEXT:    ld.u8 %r11, [%rd1+10];
+; DISABLED-NEXT:    ld.u8 %r12, [%rd1+11];
+; DISABLED-NEXT:    ld.u8 %r13, [%rd1+12];
+; DISABLED-NEXT:    ld.u8 %r14, [%rd1+13];
+; DISABLED-NEXT:    ld.u8 %r15, [%rd1+14];
+; DISABLED-NEXT:    ld.u8 %r16, [%rd1+15];
+; DISABLED-NEXT:    add.s32 %r17, %r1, %r2;
+; DISABLED-NEXT:    add.s32 %r18, %r17, %r3;
+; DISABLED-NEXT:    add.s32 %r19, %r18, %r4;
+; DISABLED-NEXT:    add.s32 %r20, %r19, %r5;
+; DISABLED-NEXT:    add.s32 %r21, %r20, %r6;
+; DISABLED-NEXT:    add.s32 %r22, %r21, %r7;
+; DISABLED-NEXT:    add.s32 %r23, %r22, %r8;
+; DISABLED-NEXT:    add.s32 %r24, %r23, %r9;
+; DISABLED-NEXT:    add.s32 %r25, %r24, %r10;
+; DISABLED-NEXT:    add.s32 %r26, %r25, %r11;
+; DISABLED-NEXT:    add.s32 %r27, %r26, %r12;
+; DISABLED-NEXT:    add.s32 %r28, %r27, %r13;
+; DISABLED-NEXT:    add.s32 %r29, %r28, %r14;
+; DISABLED-NEXT:    add.s32 %r30, %r29, %r15;
+; DISABLED-NEXT:    add.s32 %r31, %r30, %r16;
+; DISABLED-NEXT:    st.u32 [%rd2], %r31;
+; DISABLED-NEXT:    ret;
+  %val0 = load i8, ptr %ptr1, align 8
+  %ptr1.1 = getelementptr inbounds i8, ptr %ptr1, i64 1
+  %val1 = load i8, ptr %ptr1.1, align 1
+  %ptr1.2 = getelementptr inbounds i8, ptr %ptr1, i64 2
+  %val2 = load i8, ptr %ptr1.2, align 2
+  %ptr1.3 = getelementptr inbounds i8, ptr %ptr1, i64 3
+  %val3 = load i8, ptr %ptr1.3, align 1
+  %ptr1.4 = getelementptr inbounds i8, ptr %ptr1, i64 4
+  %val4 = load i8, ptr %ptr1.4, align 4
+  %ptr1.5 = getelementptr inbounds i8, ptr %ptr1, i64 5
+  %val5 = load i8, ptr %ptr1.5, align 1
+  %ptr1.6 = getelementptr inbounds i8, ptr %ptr1, i64 6
+  %val6 = load i8, ptr %ptr1.6, align 2
+  %ptr1.7 = getelementptr inbounds i8, ptr %ptr1, i64 7
+  %val7 = load i8, ptr %ptr1.7, align 1
+  %ptr1.8 = getelementptr inbounds i8, ptr %ptr1, i64 8
+  %val8 = load i8, ptr %ptr1.8, align 8
+  %ptr1.9 = getelementptr inbounds i8, ptr %ptr1, i64 9
+  %val9 = load i8, ptr %ptr1.9, align 1
+  %ptr1.10 = getelementptr inbounds i8, ptr %ptr1, i64 10
+  %val10 = load i8, ptr %ptr1.10, align 2
+  %ptr1.11 = getelementptr inbounds i8, ptr %ptr1, i64 11
+  %val11 = load i8, ptr %ptr1.11, align 1
+  %ptr1.12 = getelementptr inbounds i8, ptr %ptr1, i64 12
+  %val12 = load i8, ptr %ptr1.12, align 4
+  %ptr1.13 = getelementptr inbounds i8, ptr %ptr1, i64 13
+  %val13 = load i8, ptr %ptr1.13, align 1
+  %ptr1.14 = getelementptr inbounds i8, ptr %ptr1, i64 14
+  %val14 = load i8, ptr %ptr1.14, align 2
+  %ptr1.15 = getelementptr inbounds i8, ptr %ptr1, i64 15
+  %val15 = load i8, ptr %ptr1.15, align 1
+  %lane0 = zext i8 %val0 to i32
+  %lane1 = zext i8 %val1 to i32
+  %lane2 = zext i8 %val2 to i32
+  %lane3 = zext i8 %val3 to i32
+  %lane4 = zext i8 %val4 to i32
+  %lane5 = zext i8 %val5 to i32
+  %lane6 = zext i8 %val6 to i32
+  %lane7 = zext i8 %val7 to i32
+  %lane8 = zext i8 %val8 to i32
+  %lane9 = zext i8 %val9 to i32
+  %lane10 = zext i8 %val10 to i32
+  %lane11 = zext i8 %val11 to i32
+  %lane12 = zext i8 %val12 to i32
+  %lane13 = zext i8 %val13 to i32
+  %lane14 = zext i8 %val14 to i32
+  %lane15 = zext i8 %val15 to i32
+  %red.1 = add i32 %lane0, %lane1
+  %red.2 = add i32 %red.1, %lane2
+  %red.3 = add i32 %red.2, %lane3
+  %red.4 = add i32 %red.3, %lane4
+  %red.5 = add i32 %red.4, %lane5
+  %red.6 = add i32 %red.5, %lane6
+  %red.7 = add i32 %red.6, %lane7
+  %red.8 = add i32 %red.7, %lane8
+  %red.9 = add i32 %red.8, %lane9
+  %red.10 = add i32 %red.9, %lane10
+  %red.11 = add i32 %red.10, %lane11
+  %red.12 = add i32 %red.11, %lane12
+  %red.13 = add i32 %red.12, %lane13
+  %red.14 = add i32 %red.13, %lane14
+  %red = add i32 %red.14, %lane15
+  store i32 %red, ptr %ptr2, align 4
+  ret void
+}
+
+
 define void @combine_v8i16(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr2) {
-  ; ENABLED-LABEL: combine_v8i16
-  ; ENABLED: ld.v4.b32
+; ENABLED-LABEL: combine_v8i16(
+; ENABLED:       {
+; ENABLED-NEXT:    .reg .b16 %rs<9>;
+; ENABLED-NEXT:    .reg .b32 %r<20>;
+; ENABLED-NEXT:    .reg .b64 %rd<3>;
+; ENABLED-EMPTY:
+; ENABLED-NEXT:  // %bb.0:
+; ENABLED-NEXT:    ld.param.u64 %rd1, [combine_v8i16_param_0];
+; ENABLED-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; ENABLED-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
+; ENABLED-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
+; ENABLED-NEXT:    mov.b32 {%rs5, %rs6}, %r2;
+; ENABLED-NEXT:    mov.b32 {%rs7, %rs8}, %r1;
+; ENABLED-NEXT:    ld.param.u64 %rd2, [combine_v8i16_param_1];
+; ENABLED-NEXT:    cvt.u32.u16 %r5, %rs7;
+; ENABLED-NEXT:    cvt.u32.u16 %r6, %rs8;
+; ENABLED-NEXT:    cvt.u32.u16 %r7, %rs5;
+; ENABLED-NEXT:    cvt.u32.u16 %r8, %rs6;
+; ENABLED-NEXT:    cvt.u32.u16 %r9, %rs3;
+; ENABLED-NEXT:    cvt.u32.u16 %r10, %rs4;
+; ENABLED-NEXT:    cvt.u32.u16 %r11, %rs1;
+; ENABLED-NEXT:    cvt.u32.u16 %r12, %rs2;
+; ENABLED-NEXT:    add.s32 %r13, %r5, %r6;
+; ENABLED-NEXT:    add.s32 %r14, %r13, %r7;
+; ENABLED-NEXT:    add.s32 %r15, %r14, %r8;
+; ENABLED-NEXT:    add.s32 %r16, %r15, %r9;
+; ENABLED-NEXT:    add.s32 %r17, %r16, %r10;
+; ENABLED-NEXT:    add.s32 %r18, %r17, %r11;
+; ENABLED-NEXT:    add.s32 %r19, %r18, %r12;
+; ENABLED-NEXT:    st.u32 [%rd2], %r19;
+; ENABLED-NEXT:    ret;
+;
+; DISABLED-LABEL: combine_v8i16(
+; DISABLED:       {
+; DISABLED-NEXT:    .reg .b32 %r<16>;
+; DISABLED-NEXT:    .reg .b64 %rd<3>;
+; DISABLED-EMPTY:
+; DISABLED-NEXT:  // %bb.0:
+; DISABLED-NEXT:    ld.param.u64 %rd1, [combine_v8i16_param_0];
+; DISABLED-NEXT:    ld.u16 %r1, [%rd1];
+; DISABLED-NEXT:    ld.param.u64 %rd2, [combine_v8i16_param_1];
+; DISABLED-NEXT:    ld.u16 %r2, [%rd1+2];
+; DISABLED-NEXT:    ld.u16 %r3, [%rd1+4];
+; DISABLED-NEXT:    ld.u16 %r4, [%rd1+6];
+; DISABLED-NEXT:    ld.u16 %r5, [%rd1+8];
+; DISABLED-NEXT:    ld.u16 %r6, [%rd1+10];
+; DISABLED-NEXT:    ld.u16 %r7, [%rd1+12];
+; DISABLED-NEXT:    ld.u16 %r8, [%rd1+14];
+; DISABLED-NEXT:    add.s32 %r9, %r1, %r2;
+; DISABLED-NEXT:    add.s32 %r10, %r9, %r3;
+; DISABLED-NEXT:    add.s32 %r11, %r10, %r4;
+; DISABLED-NEXT:    add.s32 %r12, %r11, %r5;
+; DISABLED-NEXT:    add.s32 %r13, %r12, %r6;
+; DISABLED-NEXT:    add.s32 %r14, %r13, %r7;
+; DISABLED-NEXT:    add.s32 %r15, %r14, %r8;
+; DISABLED-NEXT:    st.u32 [%rd2], %r15;
+; DISABLED-NEXT:    ret;
   %val0 = load i16, ptr %ptr1, align 16
   %ptr1.1 = getelementptr inbounds i16, ptr %ptr1, i64 1
   %val1 = load i16, ptr %ptr1.1, align 2
@@ -160,8 +568,38 @@ define void @combine_v8i16(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
 }
 
 define void @combine_v4i32(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr2) {
-  ; ENABLED-LABEL: combine_v4i32
-  ; ENABLED: ld.v4.u32
+; ENABLED-LABEL: combine_v4i32(
+; ENABLED:       {
+; ENABLED-NEXT:    .reg .b32 %r<8>;
+; ENABLED-NEXT:    .reg .b64 %rd<3>;
+; ENABLED-EMPTY:
+; ENABLED-NEXT:  // %bb.0:
+; ENABLED-NEXT:    ld....
[truncated]

@Artem-B Artem-B (Member, Author) commented Sep 9, 2024

The original v16i8 combining code was introduced in #67322.

@AlexMaclean AlexMaclean (Member) left a comment

LGTM, Thanks!

@Artem-B Artem-B merged commit 26b786a into llvm:main Sep 9, 2024
8 of 9 checks passed