Skip to content

[AArch64][SVE] Use SVE for scalar FP converts in streaming[-compatible] functions (1/n) #118505

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 59 additions & 1 deletion llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19083,13 +19083,67 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
return SDValue();
}

/// Tries to replace a scalar FP <-> INT conversion with the equivalent SVE
/// vector conversion in streaming[-compatible] functions. This can help to
/// reduce the number of fmovs to/from GPRs.
///
/// The scalar source is inserted into lane 0 of an otherwise undefined
/// scalable vector, converted using the same opcode as \p N, and the scalar
/// result is extracted from lane 0 again.
///
/// \param N         The scalar conversion node to rewrite.
/// \param DAG       The DAG in which the replacement nodes are created.
/// \param DCI       Combiner state; nothing is done before operation
///                  legalization.
/// \param Subtarget Queried for SVE availability and streaming mode.
/// \returns the replacement value, or an empty SDValue if \p N is left as is.
static SDValue
tryToReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const AArch64Subtarget *Subtarget) {
  if (N->isStrictFPOpcode())
    return SDValue();

  // The conversion is rebuilt below from operand 0 alone, so a node carrying
  // further operands (e.g. the saturation type of FP_TO_[SU]INT_SAT) cannot
  // be recreated faithfully and must be left untouched.
  if (N->getNumOperands() != 1)
    return SDValue();

  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  // Only worthwhile (and only possible) when SVE instructions are usable and
  // the function is streaming or streaming-compatible.
  if (!Subtarget->isSVEorStreamingSVEAvailable() ||
      (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
    return SDValue();

  auto isSupportedType = [](EVT VT) {
    return !VT.isVector() && VT != MVT::bf16 && VT != MVT::f128;
  };

  SDValue SrcVal = N->getOperand(0);
  EVT SrcTy = SrcVal.getValueType();
  EVT DestTy = N->getValueType(0);

  if (!isSupportedType(SrcTy) || !isSupportedType(DestTy))
    return SDValue();

  // Derive both vector types from the wider of the two scalar types so that
  // source and destination vectors have the same element count.
  EVT SrcVecTy;
  EVT DestVecTy;
  if (DestTy.bitsGT(SrcTy)) {
    DestVecTy = getPackedSVEVectorVT(DestTy);
    SrcVecTy = DestVecTy.changeVectorElementType(SrcTy);
  } else {
    SrcVecTy = getPackedSVEVectorVT(SrcTy);
    DestVecTy = SrcVecTy.changeVectorElementType(DestTy);
  }

  // Ensure the resulting src/dest vector type is legal.
  if (SrcVecTy == MVT::nxv2i32 || DestVecTy == MVT::nxv2i32)
    return SDValue();

  SDLoc DL(N);
  SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
  SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
                            DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
  SDValue Convert = DAG.getNode(N->getOpcode(), DL, DestVecTy, Vec);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Convert, ZeroIdx);
}

static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
// First try to optimize away the conversion when it's conditionally from
// a constant. Vectors only.
if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
return Res;

if (SDValue Res =
tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
return Res;

EVT VT = N->getValueType(0);
if (VT != MVT::f32 && VT != MVT::f64)
return SDValue();
Expand Down Expand Up @@ -19128,6 +19182,10 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
if (SDValue Res =
tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
return Res;

if (!Subtarget->isNeonAvailable())
return SDValue();

Expand Down Expand Up @@ -26208,7 +26266,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performMulCombine(N, DAG, DCI, Subtarget);
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
return performIntToFpCombine(N, DAG, Subtarget);
return performIntToFpCombine(N, DAG, DCI, Subtarget);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::FP_TO_SINT_SAT:
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -2348,8 +2348,8 @@ let Predicates = [HasSVEorSME] in {
defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, AArch64fcvte_mt, nxv2f64, nxv2i1, nxv2f16, ElementSizeD>;
defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zdr<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, AArch64fcvtr_mt, nxv2f32, nxv2i1, nxv2f64, ElementSizeD>;
defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, AArch64fcvte_mt, nxv2f64, nxv2i1, nxv2f32, ElementSizeD>;
defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, AArch64scvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, AArch64ucvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, null_frag, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, null_frag, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd< 0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, AArch64ucvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>;
defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, AArch64scvtf_mt, nxv2f32, nxv2i1, nxv2i64, ElementSizeD>;
defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd< 0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, AArch64scvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>;
Expand Down
93 changes: 74 additions & 19 deletions llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
Original file line number Diff line number Diff line change
@@ -1,22 +1,32 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -force-streaming-compatible < %s | FileCheck %s
; RUN: llc -force-streaming-compatible -mattr=+sme2p2 < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS
; RUN: llc < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS
; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s
; RUN: llc -mattr=+sme2p2 -force-streaming-compatible < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS
; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE

target triple = "aarch64-unknown-linux-gnu"

define double @t1(double %x) {
; CHECK-LABEL: t1:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzs x8, d0
; CHECK-NEXT: scvtf d0, x8
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
; USE-NEON-NO-GPRS-LABEL: t1:
; USE-NEON-NO-GPRS: // %bb.0: // %entry
; USE-NEON-NO-GPRS-NEXT: fcvtzs d0, d0
; USE-NEON-NO-GPRS-NEXT: scvtf d0, d0
; USE-NEON-NO-GPRS-NEXT: ret
;
; NONEON-NOSVE-LABEL: t1:
; NONEON-NOSVE: // %bb.0: // %entry
; NONEON-NOSVE-NEXT: fcvtzs x8, d0
; NONEON-NOSVE-NEXT: scvtf d0, x8
; NONEON-NOSVE-NEXT: ret
entry:
%conv = fptosi double %x to i64
%conv1 = sitofp i64 %conv to double
Expand All @@ -26,15 +36,24 @@ entry:
define float @t2(float %x) {
; CHECK-LABEL: t2:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzs w8, s0
; CHECK-NEXT: scvtf s0, w8
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT: ret
;
; USE-NEON-NO-GPRS-LABEL: t2:
; USE-NEON-NO-GPRS: // %bb.0: // %entry
; USE-NEON-NO-GPRS-NEXT: fcvtzs s0, s0
; USE-NEON-NO-GPRS-NEXT: scvtf s0, s0
; USE-NEON-NO-GPRS-NEXT: ret
;
; NONEON-NOSVE-LABEL: t2:
; NONEON-NOSVE: // %bb.0: // %entry
; NONEON-NOSVE-NEXT: fcvtzs w8, s0
; NONEON-NOSVE-NEXT: scvtf s0, w8
; NONEON-NOSVE-NEXT: ret
entry:
%conv = fptosi float %x to i32
%conv1 = sitofp i32 %conv to float
Expand All @@ -44,11 +63,20 @@ entry:
define half @t3(half %x) {
; CHECK-LABEL: t3:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvt s0, h0
; CHECK-NEXT: fcvtzs w8, s0
; CHECK-NEXT: scvtf s0, w8
; CHECK-NEXT: fcvt h0, s0
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h
; CHECK-NEXT: scvtf z0.h, p0/m, z0.s
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: t3:
; NONEON-NOSVE: // %bb.0: // %entry
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fcvtzs w8, s0
; NONEON-NOSVE-NEXT: scvtf s0, w8
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: ret
entry:
%conv = fptosi half %x to i32
%conv1 = sitofp i32 %conv to half
Expand All @@ -58,15 +86,24 @@ entry:
define double @t4(double %x) {
; CHECK-LABEL: t4:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzu x8, d0
; CHECK-NEXT: ucvtf d0, x8
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
; USE-NEON-NO-GPRS-LABEL: t4:
; USE-NEON-NO-GPRS: // %bb.0: // %entry
; USE-NEON-NO-GPRS-NEXT: fcvtzu d0, d0
; USE-NEON-NO-GPRS-NEXT: ucvtf d0, d0
; USE-NEON-NO-GPRS-NEXT: ret
;
; NONEON-NOSVE-LABEL: t4:
; NONEON-NOSVE: // %bb.0: // %entry
; NONEON-NOSVE-NEXT: fcvtzu x8, d0
; NONEON-NOSVE-NEXT: ucvtf d0, x8
; NONEON-NOSVE-NEXT: ret
entry:
%conv = fptoui double %x to i64
%conv1 = uitofp i64 %conv to double
Expand All @@ -76,15 +113,24 @@ entry:
define float @t5(float %x) {
; CHECK-LABEL: t5:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzu w8, s0
; CHECK-NEXT: ucvtf s0, w8
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s
; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT: ret
;
; USE-NEON-NO-GPRS-LABEL: t5:
; USE-NEON-NO-GPRS: // %bb.0: // %entry
; USE-NEON-NO-GPRS-NEXT: fcvtzu s0, s0
; USE-NEON-NO-GPRS-NEXT: ucvtf s0, s0
; USE-NEON-NO-GPRS-NEXT: ret
;
; NONEON-NOSVE-LABEL: t5:
; NONEON-NOSVE: // %bb.0: // %entry
; NONEON-NOSVE-NEXT: fcvtzu w8, s0
; NONEON-NOSVE-NEXT: ucvtf s0, w8
; NONEON-NOSVE-NEXT: ret
entry:
%conv = fptoui float %x to i32
%conv1 = uitofp i32 %conv to float
Expand All @@ -94,11 +140,20 @@ entry:
define half @t6(half %x) {
; CHECK-LABEL: t6:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvt s0, h0
; CHECK-NEXT: fcvtzu w8, s0
; CHECK-NEXT: ucvtf s0, w8
; CHECK-NEXT: fcvt h0, s0
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h
; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: t6:
; NONEON-NOSVE: // %bb.0: // %entry
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fcvtzu w8, s0
; NONEON-NOSVE-NEXT: ucvtf s0, w8
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: ret
entry:
%conv = fptoui half %x to i32
%conv1 = uitofp i32 %conv to half
Expand Down
Loading
Loading