Skip to content

[AArch64][SVE] Lower unpredicated loads/stores as fixed LDR/STR with -msve-vector-bits=128. #127500

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23654,6 +23654,29 @@ static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
return DAG.getMergeValues({Extract, TokenFactor}, DL);
}

// Replace packed scalable loads with fixed loads when vscale_range(1, 1).
// This enables further optimisations such as LDP folds.
static SDValue combineVScale1Load(LoadSDNode *LD, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const AArch64Subtarget *Subtarget) {
  // Only act before type legalization, on little-endian targets with NEON
  // available and the SVE vector length pinned to exactly 128 bits.
  if (!DCI.isBeforeLegalize())
    return SDValue();
  if (!Subtarget->isLittleEndian() || !Subtarget->hasNEON() ||
      Subtarget->getMaxSVEVectorSizeInBits() != 128)
    return SDValue();

  // Only non-extending loads of packed (128-bit) scalable vectors qualify.
  EVT MemVT = LD->getMemoryVT();
  if (!MemVT.isScalableVector() ||
      LD->getExtensionType() != ISD::NON_EXTLOAD ||
      MemVT.getSizeInBits().getKnownMinValue() != 128)
    return SDValue();

  SDLoc DL(LD);
  // Build the equivalent fixed-length vector type (i.e. vscale == 1) and
  // load through it, preserving alignment, flags and alias info.
  EVT FixedVT = EVT::getVectorVT(*DAG.getContext(),
                                 MemVT.getVectorElementType(),
                                 MemVT.getVectorMinNumElements());
  SDValue FixedLoad = DAG.getLoad(FixedVT, DL, LD->getChain(),
                                  LD->getBasePtr(), LD->getPointerInfo(),
                                  LD->getOriginalAlign(),
                                  LD->getMemOperand()->getFlags(),
                                  LD->getAAInfo());
  // Reinterpret the fixed result as the original scalable type and return
  // both the value and the new load's chain.
  SDValue Scalable = convertToScalableVector(DAG, MemVT, FixedLoad);
  return DAG.getMergeValues({Scalable, FixedLoad.getValue(1)}, DL);
}

// Perform TBI simplification if supported by the target and try to break up
// nontemporal loads larger than 256-bits loads for odd types so LDNPQ 256-bit
// load instructions can be selected.
Expand Down Expand Up @@ -23691,6 +23714,9 @@ static SDValue performLOADCombine(SDNode *N,
if (SDValue Res = combineV3I8LoadExt(LD, DAG))
return Res;

if (SDValue Res = combineVScale1Load(LD, DAG, DCI, Subtarget))
return Res;

if (!LD->isNonTemporal())
return SDValue(N, 0);

Expand Down Expand Up @@ -23949,6 +23975,29 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
return Chain;
}

// Replace packed scalable stores with fixed stores when vscale_range(1, 1).
static SDValue combineVScale1Store(StoreSDNode *ST, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const AArch64Subtarget *Subtarget) {
  // Only act before type legalization, on little-endian targets with NEON
  // available and the SVE vector length pinned to exactly 128 bits.
  if (!DCI.isBeforeLegalize())
    return SDValue();
  if (!Subtarget->isLittleEndian() || !Subtarget->hasNEON() ||
      Subtarget->getMaxSVEVectorSizeInBits() != 128)
    return SDValue();

  // Only non-truncating stores of packed (128-bit) scalable vectors qualify.
  SDValue StoredVal = ST->getValue();
  EVT StoredVT = StoredVal.getValueType();
  if (!StoredVT.isScalableVector() || ST->isTruncatingStore() ||
      StoredVT.getSizeInBits().getKnownMinValue() != 128)
    return SDValue();

  SDLoc DL(ST);
  // Extract the value into the equivalent fixed-length vector type
  // (i.e. vscale == 1) and emit an ordinary fixed-width store, preserving
  // alignment, flags and alias info.
  EVT FixedVT = EVT::getVectorVT(*DAG.getContext(),
                                 StoredVT.getVectorElementType(),
                                 StoredVT.getVectorMinNumElements());
  SDValue FixedVal = convertFromScalableVector(DAG, FixedVT, StoredVal);
  return DAG.getStore(ST->getChain(), DL, FixedVal, ST->getBasePtr(),
                      ST->getPointerInfo(), ST->getOriginalAlign(),
                      ST->getMemOperand()->getFlags(), ST->getAAInfo());
}

static unsigned getFPSubregForVT(EVT VT) {
assert(VT.isSimple() && "Expected simple VT");
switch (VT.getSimpleVT().SimpleTy) {
Expand Down Expand Up @@ -23997,6 +24046,9 @@ static SDValue performSTORECombine(SDNode *N,
if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
return Res;

if (SDValue Res = combineVScale1Store(ST, DAG, DCI, Subtarget))
return Res;

// If this is an FP_ROUND followed by a store, fold this into a truncating
// store. We can do this even if this is already a truncstore.
// We purposefully don't care about legality of the nodes here as we know
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AArch64/sve-fixed-length-offsets.ll
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ define void @nxv16i8(ptr %ldptr, ptr %stptr) {
;
; CHECK-128-LABEL: nxv16i8:
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: ldr z0, [x0, #16, mul vl]
; CHECK-128-NEXT: str z0, [x1, #16, mul vl]
; CHECK-128-NEXT: ldr q0, [x0, #256]
; CHECK-128-NEXT: str q0, [x1, #256]
; CHECK-128-NEXT: ret
;
; CHECK-256-LABEL: nxv16i8:
Expand Down Expand Up @@ -62,8 +62,8 @@ define void @nxv8i16(ptr %ldptr, ptr %stptr) {
;
; CHECK-128-LABEL: nxv8i16:
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: ldr z0, [x0, #16, mul vl]
; CHECK-128-NEXT: str z0, [x1, #16, mul vl]
; CHECK-128-NEXT: ldr q0, [x0, #256]
; CHECK-128-NEXT: str q0, [x1, #256]
; CHECK-128-NEXT: ret
;
; CHECK-256-LABEL: nxv8i16:
Expand Down Expand Up @@ -107,8 +107,8 @@ define void @nxv4i32(ptr %ldptr, ptr %stptr) {
;
; CHECK-128-LABEL: nxv4i32:
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: ldr z0, [x0, #16, mul vl]
; CHECK-128-NEXT: str z0, [x1, #16, mul vl]
; CHECK-128-NEXT: ldr q0, [x0, #256]
; CHECK-128-NEXT: str q0, [x1, #256]
; CHECK-128-NEXT: ret
;
; CHECK-256-LABEL: nxv4i32:
Expand Down Expand Up @@ -152,8 +152,8 @@ define void @nxv2i64(ptr %ldptr, ptr %stptr) {
;
; CHECK-128-LABEL: nxv2i64:
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: ldr z0, [x0, #16, mul vl]
; CHECK-128-NEXT: str z0, [x1, #16, mul vl]
; CHECK-128-NEXT: ldr q0, [x0, #256]
; CHECK-128-NEXT: str q0, [x1, #256]
; CHECK-128-NEXT: ret
;
; CHECK-256-LABEL: nxv2i64:
Expand Down
239 changes: 239 additions & 0 deletions llvm/test/CodeGen/AArch64/sve-unpred-loads-stores.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s --check-prefix=CHECK-128
; RUN: llc -mtriple=aarch64_be-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s --check-prefix=CHECK-BE-128

; Unpredicated nxv16i8 load: little-endian with vscale_range(1,1) folds to a
; single fixed LDR Q; big-endian is unaffected and keeps the predicated LD1.
define <vscale x 16 x i8> @ld_nxv16i8(ptr %0) {
; CHECK-128-LABEL: ld_nxv16i8:
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: ldr q0, [x0]
; CHECK-128-NEXT: ret
;
; CHECK-BE-128-LABEL: ld_nxv16i8:
; CHECK-BE-128: // %bb.0:
; CHECK-BE-128-NEXT: ptrue p0.b
; CHECK-BE-128-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-BE-128-NEXT: ret
%2 = load <vscale x 16 x i8>, ptr %0, align 1
ret <vscale x 16 x i8> %2
}

; Unpredicated nxv16i8 store: little-endian with vscale_range(1,1) folds to a
; single fixed STR Q; big-endian is unaffected and keeps the predicated ST1.
define void @st_nxv16i8(ptr %0, <vscale x 16 x i8> %1) {
; CHECK-128-LABEL: st_nxv16i8:
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: str q0, [x0]
; CHECK-128-NEXT: ret
;
; CHECK-BE-128-LABEL: st_nxv16i8:
; CHECK-BE-128: // %bb.0:
; CHECK-BE-128-NEXT: ptrue p0.b
; CHECK-BE-128-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-BE-128-NEXT: ret
store <vscale x 16 x i8> %1, ptr %0, align 1
ret void
}

; Unpredicated nxv8i16 load: folds to fixed LDR Q on little-endian;
; big-endian keeps the predicated LD1H.
define <vscale x 8 x i16> @ld_nxv8i16(ptr %0) {
; CHECK-128-LABEL: ld_nxv8i16:
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: ldr q0, [x0]
; CHECK-128-NEXT: ret
;
; CHECK-BE-128-LABEL: ld_nxv8i16:
; CHECK-BE-128: // %bb.0:
; CHECK-BE-128-NEXT: ptrue p0.h
; CHECK-BE-128-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-BE-128-NEXT: ret
%2 = load <vscale x 8 x i16>, ptr %0, align 2
ret <vscale x 8 x i16> %2
}

; Unpredicated nxv8i16 store: folds to fixed STR Q on little-endian;
; big-endian keeps the predicated ST1H.
define void @st_nxv8i16(ptr %0, <vscale x 8 x i16> %1) {
; CHECK-128-LABEL: st_nxv8i16:
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: str q0, [x0]
; CHECK-128-NEXT: ret
;
; CHECK-BE-128-LABEL: st_nxv8i16:
; CHECK-BE-128: // %bb.0:
; CHECK-BE-128-NEXT: ptrue p0.h
; CHECK-BE-128-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-BE-128-NEXT: ret
store <vscale x 8 x i16> %1, ptr %0, align 2
ret void
}

; Unpredicated nxv4i32 load: folds to fixed LDR Q on little-endian;
; big-endian keeps the predicated LD1W.
define <vscale x 4 x i32> @ld_nxv4i32(ptr %0) {
; CHECK-128-LABEL: ld_nxv4i32:
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: ldr q0, [x0]
; CHECK-128-NEXT: ret
;
; CHECK-BE-128-LABEL: ld_nxv4i32:
; CHECK-BE-128: // %bb.0:
; CHECK-BE-128-NEXT: ptrue p0.s
; CHECK-BE-128-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-BE-128-NEXT: ret
%2 = load <vscale x 4 x i32>, ptr %0, align 4
ret <vscale x 4 x i32> %2
}

; Unpredicated nxv4i32 store: folds to fixed STR Q on little-endian;
; big-endian keeps the predicated ST1W.
define void @st_nxv4i32(ptr %0, <vscale x 4 x i32> %1) {
; CHECK-128-LABEL: st_nxv4i32:
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: str q0, [x0]
; CHECK-128-NEXT: ret
;
; CHECK-BE-128-LABEL: st_nxv4i32:
; CHECK-BE-128: // %bb.0:
; CHECK-BE-128-NEXT: ptrue p0.s
; CHECK-BE-128-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-BE-128-NEXT: ret
store <vscale x 4 x i32> %1, ptr %0, align 4
ret void
}

; Unpredicated nxv2i64 load: folds to fixed LDR Q on little-endian;
; big-endian keeps the predicated LD1D.
define <vscale x 2 x i64> @ld_nxv2i64(ptr %0) {
; CHECK-128-LABEL: ld_nxv2i64:
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: ldr q0, [x0]
; CHECK-128-NEXT: ret
;
; CHECK-BE-128-LABEL: ld_nxv2i64:
; CHECK-BE-128: // %bb.0:
; CHECK-BE-128-NEXT: ptrue p0.d
; CHECK-BE-128-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-BE-128-NEXT: ret
%2 = load <vscale x 2 x i64>, ptr %0, align 8
ret <vscale x 2 x i64> %2
}

; Unpredicated nxv2i64 store: folds to fixed STR Q on little-endian;
; big-endian keeps the predicated ST1D.
define void @st_nxv2i64(ptr %0, <vscale x 2 x i64> %1) {
; CHECK-128-LABEL: st_nxv2i64:
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: str q0, [x0]
; CHECK-128-NEXT: ret
;
; CHECK-BE-128-LABEL: st_nxv2i64:
; CHECK-BE-128: // %bb.0:
; CHECK-BE-128-NEXT: ptrue p0.d
; CHECK-BE-128-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-BE-128-NEXT: ret
store <vscale x 2 x i64> %1, ptr %0, align 8
ret void
}

; Floating-point element types get the same fold: nxv8f16 load becomes a
; fixed LDR Q on little-endian; big-endian keeps the predicated LD1H.
define <vscale x 8 x half> @ld_nxv8f16(ptr %0) {
; CHECK-128-LABEL: ld_nxv8f16:
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: ldr q0, [x0]
; CHECK-128-NEXT: ret
;
; CHECK-BE-128-LABEL: ld_nxv8f16:
; CHECK-BE-128: // %bb.0:
; CHECK-BE-128-NEXT: ptrue p0.h
; CHECK-BE-128-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-BE-128-NEXT: ret
%2 = load <vscale x 8 x half>, ptr %0, align 2
ret <vscale x 8 x half> %2
}

; Unpredicated nxv8f16 store: folds to fixed STR Q on little-endian;
; big-endian keeps the predicated ST1H.
define void @st_nxv8f16(ptr %0, <vscale x 8 x half> %1) {
; CHECK-128-LABEL: st_nxv8f16:
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: str q0, [x0]
; CHECK-128-NEXT: ret
;
; CHECK-BE-128-LABEL: st_nxv8f16:
; CHECK-BE-128: // %bb.0:
; CHECK-BE-128-NEXT: ptrue p0.h
; CHECK-BE-128-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-BE-128-NEXT: ret
store <vscale x 8 x half> %1, ptr %0, align 2
ret void
}

; Unpredicated nxv4f32 load: folds to fixed LDR Q on little-endian;
; big-endian keeps the predicated LD1W.
define <vscale x 4 x float> @ld_nxv4f32(ptr %0) {
; CHECK-128-LABEL: ld_nxv4f32:
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: ldr q0, [x0]
; CHECK-128-NEXT: ret
;
; CHECK-BE-128-LABEL: ld_nxv4f32:
; CHECK-BE-128: // %bb.0:
; CHECK-BE-128-NEXT: ptrue p0.s
; CHECK-BE-128-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-BE-128-NEXT: ret
%2 = load <vscale x 4 x float>, ptr %0, align 4
ret <vscale x 4 x float> %2
}

; Unpredicated nxv4f32 store: folds to fixed STR Q on little-endian;
; big-endian keeps the predicated ST1W.
define void @st_nxv4f32(ptr %0, <vscale x 4 x float> %1) {
; CHECK-128-LABEL: st_nxv4f32:
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: str q0, [x0]
; CHECK-128-NEXT: ret
;
; CHECK-BE-128-LABEL: st_nxv4f32:
; CHECK-BE-128: // %bb.0:
; CHECK-BE-128-NEXT: ptrue p0.s
; CHECK-BE-128-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-BE-128-NEXT: ret
store <vscale x 4 x float> %1, ptr %0, align 4
ret void
}

; Unpredicated nxv2f64 load: folds to fixed LDR Q on little-endian;
; big-endian keeps the predicated LD1D.
define <vscale x 2 x double> @ld_nxv2f64(ptr %0) {
; CHECK-128-LABEL: ld_nxv2f64:
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: ldr q0, [x0]
; CHECK-128-NEXT: ret
;
; CHECK-BE-128-LABEL: ld_nxv2f64:
; CHECK-BE-128: // %bb.0:
; CHECK-BE-128-NEXT: ptrue p0.d
; CHECK-BE-128-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-BE-128-NEXT: ret
%2 = load <vscale x 2 x double>, ptr %0, align 8
ret <vscale x 2 x double> %2
}

; Unpredicated nxv2f64 store: folds to fixed STR Q on little-endian;
; big-endian keeps the predicated ST1D.
define void @st_nxv2f64(ptr %0, <vscale x 2 x double> %1) {
; CHECK-128-LABEL: st_nxv2f64:
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: str q0, [x0]
; CHECK-128-NEXT: ret
;
; CHECK-BE-128-LABEL: st_nxv2f64:
; CHECK-BE-128: // %bb.0:
; CHECK-BE-128-NEXT: ptrue p0.d
; CHECK-BE-128-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-BE-128-NEXT: ret
store <vscale x 2 x double> %1, ptr %0, align 8
ret void
}

; Test LDP/STP fold: once the scalable accesses are rewritten as fixed
; 128-bit loads/stores, the two adjacent accesses pair into a single
; LDP/STP on little-endian. Big-endian keeps the predicated SVE ops.
define void @ldp_stp_nxv16i8_offset(ptr %ldptr, ptr %stptr) {
; CHECK-128-LABEL: ldp_stp_nxv16i8_offset:
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: ldp q0, q1, [x0, #-16]
; CHECK-128-NEXT: stp q0, q1, [x1, #-16]
; CHECK-128-NEXT: ret
;
; CHECK-BE-128-LABEL: ldp_stp_nxv16i8_offset:
; CHECK-BE-128: // %bb.0:
; CHECK-BE-128-NEXT: ptrue p0.b
; CHECK-BE-128-NEXT: mov x8, #-16 // =0xfffffffffffffff0
; CHECK-BE-128-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; CHECK-BE-128-NEXT: ld1b { z1.b }, p0/z, [x0]
; CHECK-BE-128-NEXT: st1b { z0.b }, p0, [x1, x8]
; CHECK-BE-128-NEXT: st1b { z1.b }, p0, [x1]
; CHECK-BE-128-NEXT: ret
%ldptr.1 = getelementptr inbounds i8, ptr %ldptr, i64 -16
%ld1 = load <vscale x 16 x i8>, ptr %ldptr.1, align 1
%ld2 = load <vscale x 16 x i8>, ptr %ldptr, align 1
%stptr.1 = getelementptr inbounds i8, ptr %stptr, i64 -16
store <vscale x 16 x i8> %ld1, ptr %stptr.1, align 1
store <vscale x 16 x i8> %ld2, ptr %stptr, align 1
ret void
}
Loading