diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 37fcd09d4f562..fa054cac84f1c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -508,6 +508,7 @@ namespace { SDValue visitFSQRT(SDNode *N); SDValue visitFCOPYSIGN(SDNode *N); SDValue visitFPOW(SDNode *N); + SDValue visitFCANONICALIZE(SDNode *N); SDValue visitSINT_TO_FP(SDNode *N); SDValue visitUINT_TO_FP(SDNode *N); SDValue visitFP_TO_SINT(SDNode *N); @@ -1980,6 +1981,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::FREEZE: return visitFREEZE(N); case ISD::GET_FPENV_MEM: return visitGET_FPENV_MEM(N); case ISD::SET_FPENV_MEM: return visitSET_FPENV_MEM(N); + case ISD::FCANONICALIZE: return visitFCANONICALIZE(N); case ISD::VECREDUCE_FADD: case ISD::VECREDUCE_FMUL: case ISD::VECREDUCE_ADD: @@ -2090,6 +2092,19 @@ static SDValue getInputChainForNode(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitFCANONICALIZE(SDNode *N) { + SDValue Operand = N->getOperand(0); + EVT VT = Operand.getValueType(); + SDLoc dl(N); + + // Canonicalize undef to quiet NaN. + if (Operand.isUndef()) { + APFloat CanonicalQNaN = APFloat::getQNaN(VT.getFltSemantics()); + return DAG.getConstantFP(CanonicalQNaN, dl, VT); + } + return SDValue(); +} + SDValue DAGCombiner::visitTokenFactor(SDNode *N) { // If N has two operands, where one has an input chain equal to the other, // the 'other' chain is redundant. diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 182f6c08366a9..0a3fa7a4a8fda 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -331,9 +331,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom); setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom); } + setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom); if (Subtarget.is64Bit()) { setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom); } } if (Subtarget.hasAVX10_2()) { @@ -353,6 +355,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (!Subtarget.hasSSE2()) { setOperationAction(ISD::BITCAST , MVT::f32 , Expand); setOperationAction(ISD::BITCAST , MVT::i32 , Expand); + setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::f80, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom); if (Subtarget.is64Bit()) { setOperationAction(ISD::BITCAST , MVT::f64 , Expand); // Without SSE, i64->f64 goes through memory. 
@@ -716,6 +721,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote); setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::f16, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); @@ -932,6 +938,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (isTypeLegal(MVT::f80)) { setOperationAction(ISD::FP_ROUND, MVT::f80, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::f80, Custom); } setOperationAction(ISD::SETCC, MVT::f128, Custom); @@ -1065,9 +1072,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::v4f32, Custom); setOperationAction(ISD::LOAD, MVT::v2f32, Custom); setOperationAction(ISD::STORE, MVT::v2f32, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal); @@ -1128,6 +1137,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UMULO, MVT::v2i32, Custom); setOperationAction(ISD::FNEG, MVT::v2f64, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::v2f64, Custom); setOperationAction(ISD::FABS, MVT::v2f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom); @@ -1460,6 +1470,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMAXIMUM, VT, Custom); setOperationAction(ISD::FMINIMUM, VT, Custom); + setOperationAction(ISD::FCANONICALIZE, VT, Custom); } setOperationAction(ISD::LRINT, MVT::v8f32, Custom); @@ -1725,6 +1736,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::v8f16, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::v16f16, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::v32f16, Custom); // There is no byte sized k-register load or store without AVX512DQ. if (!Subtarget.hasDQI()) { @@ -1804,6 +1818,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMA, VT, Legal); setOperationAction(ISD::STRICT_FMA, VT, Legal); setOperationAction(ISD::FCOPYSIGN, VT, Custom); + setOperationAction(ISD::FCANONICALIZE, VT, Custom); } setOperationAction(ISD::LRINT, MVT::v16f32, Subtarget.hasDQI() ? Legal : Custom); @@ -32664,6 +32679,24 @@ static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget, return Op; } +static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG) { + SDNode *N = Op.getNode(); + SDValue Operand = N->getOperand(0); + EVT VT = Operand.getValueType(); + SDLoc dl(N); + + SDValue One = DAG.getConstantFP(1.0, dl, VT); + + // TODO: Fix crash for bf16 when generating strict_fmul, as it + // leads to an error: SoftPromoteHalfResult #0: t11: bf16,ch = strict_fmul t0, + // ConstantFP:bf16, t5 LLVM ERROR: Do not know how to soft + // promote this operator's result!
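+ // Lower (fcanonicalize x) as (strict_fmul x, 1.0): per the LangRef, canonicalize + // may be implemented as a multiplication by 1.0, which quiets signaling NaNs and + // canonicalizes non-canonical encodings. A strict node is used here so that the + // generic DAG combines do not fold the multiply away as an identity operation.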
+ SDValue Chain = DAG.getEntryNode(); + SDValue StrictFmul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other}, + {Chain, Operand, One}); + return StrictFmul; +} + static StringRef getInstrStrFromOpNo(const SmallVectorImpl &AsmStrs, unsigned OpNo) { const APInt Operand(32, OpNo); @@ -32803,6 +32836,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); case ISD::FSHL: case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG); + case ISD::FCANONICALIZE: return LowerFCanonicalize(Op, DAG); case ISD::STRICT_SINT_TO_FP: case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::STRICT_UINT_TO_FP: diff --git a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll new file mode 100644 index 0000000000000..52048a0a2065b --- /dev/null +++ b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll @@ -0,0 +1,415 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --default-march x86_64-unknown-linux-gnu --version 5 +; RUN: llc -mattr=+sse2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=SSE +; RUN: llc -mattr=+avx -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX1 +; RUN: llc -mattr=+avx2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX2 +; RUN: llc -mattr=+avx512f -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX512F +; RUN: llc -mattr=+avx512bw -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX512BW + +define void @v_test_canonicalize__half(half addrspace(1)* %out) nounwind { +; SSE-LABEL: v_test_canonicalize__half: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pushq %rbx +; SSE-NEXT: subq $16, %rsp +; SSE-NEXT: movq %rdi, %rbx +; SSE-NEXT: pinsrw $0, (%rdi), %xmm0 +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE-NEXT: pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: mulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE-NEXT: callq __truncsfhf2@PLT +; SSE-NEXT: pextrw $0, %xmm0, %eax +; SSE-NEXT: movw %ax, (%rbx) +; SSE-NEXT: addq $16, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: retq +; +; AVX1-LABEL: v_test_canonicalize__half: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: subq $16, %rsp +; AVX1-NEXT: movq %rdi, %rbx +; AVX1-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; AVX1-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: vpextrw $0, %xmm0, (%rbx) +; AVX1-NEXT: addq $16, %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: retq +; +; AVX2-LABEL: v_test_canonicalize__half: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $16, %rsp +; AVX2-NEXT: movq %rdi, %rbx +; AVX2-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; AVX2-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX2-NEXT: callq __truncsfhf2@PLT +; AVX2-NEXT: vpextrw $0, %xmm0, (%rbx) +; AVX2-NEXT: addq $16, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: retq +; 
+; AVX512F-LABEL: v_test_canonicalize__half: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: movzwl (%rdi), %eax +; AVX512F-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512F-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: movw %ax, (%rdi) +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v_test_canonicalize__half: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: movzwl (%rdi), %eax +; AVX512BW-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx +; AVX512BW-NEXT: vmovd %ecx, %xmm0 +; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512BW-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: movw %ax, (%rdi) +; AVX512BW-NEXT: retq +entry: + %val = load half, half addrspace(1)* %out + %canonicalized = call half @llvm.canonicalize.f16(half %val) + store half %canonicalized, half addrspace(1)* %out + ret void +} + +define half @complex_canonicalize_fmul_half(half %a, half %b) nounwind { +; SSE-LABEL: complex_canonicalize_fmul_half: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pushq %rax +; SSE-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movss (%rsp), %xmm1 # 4-byte Reload +; SSE-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: subss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: callq __truncsfhf2@PLT +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE-NEXT: callq __truncsfhf2@PLT +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: subss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE-NEXT: callq __truncsfhf2@PLT +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE-NEXT: pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: mulss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE-NEXT: callq __truncsfhf2@PLT +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: subss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE-NEXT: callq __truncsfhf2@PLT +; SSE-NEXT: popq %rax +; SSE-NEXT: retq +; +; AVX1-LABEL: complex_canonicalize_fmul_half: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rax +; AVX1-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill +; AVX1-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vmovss (%rsp), %xmm1 # 4-byte Reload +; AVX1-NEXT: # xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; 
AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill +; AVX1-NEXT: vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill +; AVX1-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmulss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: popq %rax +; AVX1-NEXT: retq +; +; AVX2-LABEL: complex_canonicalize_fmul_half: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: pushq %rax +; AVX2-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill +; AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vmovss (%rsp), %xmm1 # 4-byte Reload +; AVX2-NEXT: # xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: callq __truncsfhf2@PLT +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill +; AVX2-NEXT: vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX2-NEXT: callq __truncsfhf2@PLT +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX2-NEXT: callq __truncsfhf2@PLT +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill +; AVX2-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmulss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX2-NEXT: callq __truncsfhf2@PLT +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX2-NEXT: callq __truncsfhf2@PLT +; AVX2-NEXT: popq %rax +; AVX2-NEXT: retq +; +; AVX512F-LABEL: complex_canonicalize_fmul_half: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vpextrw $0, %xmm1, %eax +; AVX512F-NEXT: vpextrw $0, %xmm0, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512F-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vaddss %xmm1, %xmm0, %xmm2 +; AVX512F-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512F-NEXT: vsubss %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax +; AVX512F-NEXT: vmovd %eax, %xmm2 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512F-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; 
AVX512F-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: complex_canonicalize_fmul_half: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vpextrw $0, %xmm1, %eax +; AVX512BW-NEXT: vpextrw $0, %xmm0, %ecx +; AVX512BW-NEXT: vmovd %ecx, %xmm0 +; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512BW-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512BW-NEXT: vaddss %xmm1, %xmm0, %xmm2 +; AVX512BW-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512BW-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512BW-NEXT: vsubss %xmm0, %xmm2, %xmm0 +; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512BW-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax +; AVX512BW-NEXT: vmovd %eax, %xmm2 +; AVX512BW-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512BW-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512BW-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: retq +entry: + + %mul1 = fsub half %a, %b + %add = fadd half %mul1, %b + %mul2 = fsub half %add, %mul1 + %canonicalized = call half @llvm.canonicalize.f16(half %mul2) + %result = fsub half %canonicalized, %b + ret half %result +} + +define void @v_test_canonicalize_v2half(<2 x half> addrspace(1)* %out) nounwind { +; SSE-LABEL: v_test_canonicalize_v2half: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pushq %rbx +; SSE-NEXT: subq $48, %rsp +; SSE-NEXT: movq %rdi, %rbx +; SSE-NEXT: pinsrw $0, 2(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pinsrw $0, (%rdi), %xmm0 +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE-NEXT: pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSE-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: mulss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: callq __truncsfhf2@PLT +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: mulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE-NEXT: callq __truncsfhf2@PLT +; SSE-NEXT: pextrw $0, %xmm0, %eax +; SSE-NEXT: movw %ax, 2(%rbx) +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pextrw $0, %xmm0, %eax +; SSE-NEXT: movw %ax, (%rbx) +; SSE-NEXT: addq $48, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: retq +; +; AVX1-LABEL: v_test_canonicalize_v2half: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: subq $48, %rsp +; AVX1-NEXT: movq %rdi, %rbx +; AVX1-NEXT: vpinsrw $0, 2(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 +; 
AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; AVX1-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; AVX1-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: vpextrw $0, %xmm0, 2(%rbx) +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vpextrw $0, %xmm0, (%rbx) +; AVX1-NEXT: addq $48, %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: retq +; +; AVX2-LABEL: v_test_canonicalize_v2half: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $48, %rsp +; AVX2-NEXT: movq %rdi, %rbx +; AVX2-NEXT: vpinsrw $0, 2(%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; AVX2-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; AVX2-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX2-NEXT: callq __truncsfhf2@PLT +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX2-NEXT: callq __truncsfhf2@PLT +; AVX2-NEXT: vpextrw $0, %xmm0, 2(%rbx) +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vpextrw $0, %xmm0, (%rbx) +; AVX2-NEXT: addq $48, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: retq +; +; AVX512F-LABEL: v_test_canonicalize_v2half: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512F-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512F-NEXT: vmulss %xmm1, %xmm2, %xmm2 +; AVX512F-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3] +; AVX512F-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512F-NEXT: vmovd %xmm2, %eax +; AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3] +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512F-NEXT: vmovd %xmm0, (%rdi) +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v_test_canonicalize_v2half: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vmovd {{.*#+}} xmm0 = 
mem[0],zero,zero,zero +; AVX512BW-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax +; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512BW-NEXT: vmulss %xmm1, %xmm2, %xmm2 +; AVX512BW-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512BW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3] +; AVX512BW-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovd %xmm2, %eax +; AVX512BW-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512BW-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3] +; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512BW-NEXT: vmovd %xmm0, (%rdi) +; AVX512BW-NEXT: retq +entry: + %val = load <2 x half>, <2 x half> addrspace(1)* %out + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val) + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out + ret void +} + diff --git a/llvm/test/CodeGen/X86/canonicalize-vars.ll b/llvm/test/CodeGen/X86/canonicalize-vars.ll new file mode 100644 index 0000000000000..13ea53389411b --- /dev/null +++ b/llvm/test/CodeGen/X86/canonicalize-vars.ll @@ -0,0 +1,636 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --default-march x86_64-unknown-linux-gnu --version 5 +; RUN: llc -mtriple=i686-- --mattr=-sse2 < %s | FileCheck %s -check-prefixes=SSE1 +; RUN: llc -mattr=+sse2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=SSE2 +; RUN: llc -mattr=+avx -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX1 +; RUN: llc -mattr=+avx2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX1,AVX2 +; RUN: llc -mattr=+avx512f -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX1,AVX512F + +define float @canon_fp32_varargsf32(float %a) { +; SSE1-LABEL: canon_fp32_varargsf32: +; SSE1: # %bb.0: +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmuls {{[0-9]+}}(%esp) +; SSE1-NEXT: retl +; +; SSE2-LABEL: canon_fp32_varargsf32: +; SSE2: # %bb.0: +; SSE2-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: canon_fp32_varargsf32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq + + %canonicalized = call float @llvm.canonicalize.f32(float %a) + ret float %canonicalized +} + +define x86_fp80 @canon_fp32_varargsf80(x86_fp80 %a) { +; SSE1-LABEL: canon_fp32_varargsf80: +; SSE1: # %bb.0: +; SSE1-NEXT: fldt {{[0-9]+}}(%esp) +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmulp %st, %st(1) +; SSE1-NEXT: retl +; +; SSE2-LABEL: canon_fp32_varargsf80: +; SSE2: # %bb.0: +; SSE2-NEXT: fldt {{[0-9]+}}(%rsp) +; SSE2-NEXT: fld1 +; SSE2-NEXT: fmulp %st, %st(1) +; SSE2-NEXT: retq +; +; AVX1-LABEL: canon_fp32_varargsf80: +; AVX1: # %bb.0: +; AVX1-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX1-NEXT: fld1 +; AVX1-NEXT: fmulp %st, %st(1) +; AVX1-NEXT: retq + %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 %a) + ret x86_fp80 %canonicalized +} + +define x86_fp80 @complex_canonicalize_fmul_x86_fp80(x86_fp80 %a, x86_fp80 %b) { +; SSE1-LABEL: complex_canonicalize_fmul_x86_fp80: +; SSE1: # %bb.0: # %entry +; SSE1-NEXT: fldt 
{{[0-9]+}}(%esp) +; SSE1-NEXT: fldt {{[0-9]+}}(%esp) +; SSE1-NEXT: fsub %st(1), %st +; SSE1-NEXT: fld %st(0) +; SSE1-NEXT: fadd %st(2), %st +; SSE1-NEXT: fsubp %st, %st(1) +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmulp %st, %st(1) +; SSE1-NEXT: fsubp %st, %st(1) +; SSE1-NEXT: retl +; +; SSE2-LABEL: complex_canonicalize_fmul_x86_fp80: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: fldt {{[0-9]+}}(%rsp) +; SSE2-NEXT: fldt {{[0-9]+}}(%rsp) +; SSE2-NEXT: fsub %st(1), %st +; SSE2-NEXT: fld %st(0) +; SSE2-NEXT: fadd %st(2), %st +; SSE2-NEXT: fsubp %st, %st(1) +; SSE2-NEXT: fld1 +; SSE2-NEXT: fmulp %st, %st(1) +; SSE2-NEXT: fsubp %st, %st(1) +; SSE2-NEXT: retq +; +; AVX1-LABEL: complex_canonicalize_fmul_x86_fp80: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX1-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX1-NEXT: fsub %st(1), %st +; AVX1-NEXT: fld %st(0) +; AVX1-NEXT: fadd %st(2), %st +; AVX1-NEXT: fsubp %st, %st(1) +; AVX1-NEXT: fld1 +; AVX1-NEXT: fmulp %st, %st(1) +; AVX1-NEXT: fsubp %st, %st(1) +; AVX1-NEXT: retq +entry: + + %mul1 = fsub x86_fp80 %a, %b + %add = fadd x86_fp80 %mul1, %b + %mul2 = fsub x86_fp80 %add, %mul1 + %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 %mul2) + %result = fsub x86_fp80 %canonicalized, %b + ret x86_fp80 %result +} + +define double @canonicalize_fp64(double %a, double %b) unnamed_addr #0 { +; SSE1-LABEL: canonicalize_fp64: +; SSE1: # %bb.0: # %start +; SSE1-NEXT: fldl {{[0-9]+}}(%esp) +; SSE1-NEXT: fldl {{[0-9]+}}(%esp) +; SSE1-NEXT: fucom %st(1) +; SSE1-NEXT: fnstsw %ax +; SSE1-NEXT: # kill: def $ah killed $ah killed $ax +; SSE1-NEXT: sahf +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: fucom %st(0) +; SSE1-NEXT: fnstsw %ax +; SSE1-NEXT: fld %st(1) +; SSE1-NEXT: ja .LBB3_2 +; SSE1-NEXT: # %bb.1: # %start +; SSE1-NEXT: fstp %st(0) +; SSE1-NEXT: fldz +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: .LBB3_2: # %start +; SSE1-NEXT: fstp %st(1) +; SSE1-NEXT: # kill: def $ah killed $ah killed $ax +; SSE1-NEXT: sahf +; SSE1-NEXT: jp .LBB3_4 +; SSE1-NEXT: # %bb.3: # %start +; SSE1-NEXT: fstp %st(1) +; SSE1-NEXT: fldz +; SSE1-NEXT: .LBB3_4: # %start +; SSE1-NEXT: fstp %st(0) +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmulp %st, %st(1) +; SSE1-NEXT: retl +; +; SSE2-LABEL: canonicalize_fp64: +; SSE2: # %bb.0: # %start +; SSE2-NEXT: movapd %xmm0, %xmm2 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm2 +; SSE2-NEXT: movapd %xmm2, %xmm3 +; SSE2-NEXT: andpd %xmm1, %xmm3 +; SSE2-NEXT: maxsd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm1, %xmm2 +; SSE2-NEXT: orpd %xmm3, %xmm2 +; SSE2-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: movapd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: canonicalize_fp64: +; AVX2: # %bb.0: # %start +; AVX2-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: canonicalize_fp64: +; AVX512F: # %bb.0: # %start +; AVX512F-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 +; AVX512F-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512F-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1} +; AVX512F-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0 +; AVX512F-NEXT: retq +start: + + %c = fcmp olt double %a, %b + %d = fcmp uno double %a, 0.000000e+00 + %or.cond.i.i = or i1 %d, %c + %e = select i1 %or.cond.i.i, double %b, double %a + %f = tail call double @llvm.canonicalize.f64(double %e) #2 + ret double %f +} + +define float @canonicalize_fp32(float %aa, float %bb) unnamed_addr #0 { +; SSE1-LABEL: canonicalize_fp32: +; 
SSE1: # %bb.0: # %start +; SSE1-NEXT: flds {{[0-9]+}}(%esp) +; SSE1-NEXT: flds {{[0-9]+}}(%esp) +; SSE1-NEXT: fucom %st(1) +; SSE1-NEXT: fnstsw %ax +; SSE1-NEXT: # kill: def $ah killed $ah killed $ax +; SSE1-NEXT: sahf +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: fucom %st(0) +; SSE1-NEXT: fnstsw %ax +; SSE1-NEXT: fld %st(1) +; SSE1-NEXT: ja .LBB4_2 +; SSE1-NEXT: # %bb.1: # %start +; SSE1-NEXT: fstp %st(0) +; SSE1-NEXT: fldz +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: .LBB4_2: # %start +; SSE1-NEXT: fstp %st(1) +; SSE1-NEXT: # kill: def $ah killed $ah killed $ax +; SSE1-NEXT: sahf +; SSE1-NEXT: jp .LBB4_4 +; SSE1-NEXT: # %bb.3: # %start +; SSE1-NEXT: fstp %st(1) +; SSE1-NEXT: fldz +; SSE1-NEXT: .LBB4_4: # %start +; SSE1-NEXT: fstp %st(0) +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmulp %st, %st(1) +; SSE1-NEXT: retl +; +; SSE2-LABEL: canonicalize_fp32: +; SSE2: # %bb.0: # %start +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: cmpunordss %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm1, %xmm3 +; SSE2-NEXT: maxss %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm1, %xmm2 +; SSE2-NEXT: orps %xmm3, %xmm2 +; SSE2-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: canonicalize_fp32: +; AVX2: # %bb.0: # %start +; AVX2-NEXT: vmaxss %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: canonicalize_fp32: +; AVX512F: # %bb.0: # %start +; AVX512F-NEXT: vmaxss %xmm0, %xmm1, %xmm2 +; AVX512F-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512F-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512F-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0 +; AVX512F-NEXT: retq +start: + + %cc = fcmp olt float %aa, %bb + %dd = fcmp uno float %aa, 0.000000e+00 + %or.cond.i.i.x = or i1 %dd, %cc + %ee = select i1 %or.cond.i.i.x, float %bb, float %aa + %ff = tail call float @llvm.canonicalize.f32(float %ee) #2 + ret float %ff +} + +define void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 { +; SSE1-LABEL: v_test_canonicalize_var_f32: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmuls (%eax) +; SSE1-NEXT: fstps (%eax) +; SSE1-NEXT: retl +; +; SSE2-LABEL: v_test_canonicalize_var_f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movss %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: v_test_canonicalize_var_f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovss %xmm0, (%rdi) +; AVX1-NEXT: retq + %val = load float, float addrspace(1)* %out + %canonicalized = call float @llvm.canonicalize.f32(float %val) + store float %canonicalized, float addrspace(1)* %out + ret void +} + +define void @v_test_canonicalize_x86_fp80(x86_fp80 addrspace(1)* %out) #1 { +; SSE1-LABEL: v_test_canonicalize_x86_fp80: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: fldt (%eax) +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmulp %st, %st(1) +; SSE1-NEXT: fstpt (%eax) +; SSE1-NEXT: retl +; +; SSE2-LABEL: v_test_canonicalize_x86_fp80: +; SSE2: # %bb.0: +; SSE2-NEXT: fldt (%rdi) +; SSE2-NEXT: fld1 +; SSE2-NEXT: fmulp %st, %st(1) +; SSE2-NEXT: fstpt (%rdi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: v_test_canonicalize_x86_fp80: +; AVX1: # %bb.0: 
+; AVX1-NEXT: fldt (%rdi) +; AVX1-NEXT: fld1 +; AVX1-NEXT: fmulp %st, %st(1) +; AVX1-NEXT: fstpt (%rdi) +; AVX1-NEXT: retq + + %val = load x86_fp80, x86_fp80 addrspace(1)* %out + %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 %val) + store x86_fp80 %canonicalized, x86_fp80 addrspace(1)* %out + ret void +} + +define void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 { +; SSE1-LABEL: v_test_canonicalize_var_f64: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmull (%eax) +; SSE1-NEXT: fstpl (%eax) +; SSE1-NEXT: retl +; +; SSE2-LABEL: v_test_canonicalize_var_f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movsd %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: v_test_canonicalize_var_f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovsd %xmm0, (%rdi) +; AVX1-NEXT: retq + + %val = load double, double addrspace(1)* %out + %canonicalized = call double @llvm.canonicalize.f64(double %val) + store double %canonicalized, double addrspace(1)* %out + ret void +} + +define void @canonicalize_undef(double addrspace(1)* %out) { +; SSE1-LABEL: canonicalize_undef: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: movl $2146959360, 4(%eax) # imm = 0x7FF80000 +; SSE1-NEXT: movl $0, (%eax) +; SSE1-NEXT: retl +; +; SSE2-LABEL: canonicalize_undef: +; SSE2: # %bb.0: +; SSE2-NEXT: movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000 +; SSE2-NEXT: movq %rax, (%rdi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: canonicalize_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000 +; AVX1-NEXT: movq %rax, (%rdi) +; AVX1-NEXT: retq + + %canonicalized = call double @llvm.canonicalize.f64(double undef) + store double %canonicalized, double addrspace(1)* %out + ret void +} + +define <4 x float> @canon_fp32_varargsv4f32(<4 x float> %a) { +; SSE1-LABEL: canon_fp32_varargsv4f32: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: fld1 +; SSE1-NEXT: fld %st(0) +; SSE1-NEXT: fmuls {{[0-9]+}}(%esp) +; SSE1-NEXT: fld %st(1) +; SSE1-NEXT: fmuls {{[0-9]+}}(%esp) +; SSE1-NEXT: fld %st(2) +; SSE1-NEXT: fmuls {{[0-9]+}}(%esp) +; SSE1-NEXT: fxch %st(3) +; SSE1-NEXT: fmuls {{[0-9]+}}(%esp) +; SSE1-NEXT: fstps 12(%eax) +; SSE1-NEXT: fxch %st(2) +; SSE1-NEXT: fstps 8(%eax) +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: fstps 4(%eax) +; SSE1-NEXT: fstps (%eax) +; SSE1-NEXT: retl $4 +; +; SSE2-LABEL: canon_fp32_varargsv4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: canon_fp32_varargsv4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX2-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: canon_fp32_varargsv4f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512F-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: retq + %canonicalized = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %a) + ret <4 x float> %canonicalized +} + +define <4 x double> @canon_fp64_varargsv4f64(<4 x double> %a) { +; SSE1-LABEL: canon_fp64_varargsv4f64: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: fld1 +; SSE1-NEXT: fld %st(0) +; SSE1-NEXT: fmull {{[0-9]+}}(%esp) +; SSE1-NEXT: fld 
%st(1) +; SSE1-NEXT: fmull {{[0-9]+}}(%esp) +; SSE1-NEXT: fld %st(2) +; SSE1-NEXT: fmull {{[0-9]+}}(%esp) +; SSE1-NEXT: fxch %st(3) +; SSE1-NEXT: fmull {{[0-9]+}}(%esp) +; SSE1-NEXT: fstpl 24(%eax) +; SSE1-NEXT: fxch %st(2) +; SSE1-NEXT: fstpl 16(%eax) +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: fstpl 8(%eax) +; SSE1-NEXT: fstpl (%eax) +; SSE1-NEXT: retl $4 +; +; SSE2-LABEL: canon_fp64_varargsv4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd {{.*#+}} xmm2 = [1.0E+0,1.0E+0] +; SSE2-NEXT: mulpd %xmm2, %xmm0 +; SSE2-NEXT: mulpd %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX2-LABEL: canon_fp64_varargsv4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX2-NEXT: vmulpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: canon_fp64_varargsv4f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512F-NEXT: vmulpd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq + %canonicalized = call <4 x double> @llvm.canonicalize.v4f32(<4 x double> %a) + ret <4 x double> %canonicalized +} + +define <2 x x86_fp80> @canon_fp80_varargsv2fp80(<2 x x86_fp80> %a) { +; SSE1-LABEL: canon_fp80_varargsv2fp80: +; SSE1: # %bb.0: +; SSE1-NEXT: fldt {{[0-9]+}}(%esp) +; SSE1-NEXT: fldt {{[0-9]+}}(%esp) +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmul %st, %st(1) +; SSE1-NEXT: fmulp %st, %st(2) +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: retl +; +; SSE2-LABEL: canon_fp80_varargsv2fp80: +; SSE2: # %bb.0: +; SSE2-NEXT: fldt {{[0-9]+}}(%rsp) +; SSE2-NEXT: fldt {{[0-9]+}}(%rsp) +; SSE2-NEXT: fld1 +; SSE2-NEXT: fmul %st, %st(1) +; SSE2-NEXT: fmulp %st, %st(2) +; SSE2-NEXT: fxch %st(1) +; SSE2-NEXT: retq +; +; AVX1-LABEL: canon_fp80_varargsv2fp80: +; AVX1: # %bb.0: +; AVX1-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX1-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX1-NEXT: fld1 +; AVX1-NEXT: fmul %st, %st(1) +; AVX1-NEXT: fmulp %st, %st(2) +; AVX1-NEXT: fxch %st(1) +; AVX1-NEXT: retq + %canonicalized = call <2 x x86_fp80> @llvm.canonicalize.v2f80(<2 x x86_fp80> %a) + ret <2 x x86_fp80> %canonicalized +} + +define void @vec_canonicalize_var_v4f32(<4 x float> addrspace(1)* %out) #1 { +; SSE1-LABEL: vec_canonicalize_var_v4f32: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: fld1 +; SSE1-NEXT: fld %st(0) +; SSE1-NEXT: fmuls (%eax) +; SSE1-NEXT: fld %st(1) +; SSE1-NEXT: fmuls 4(%eax) +; SSE1-NEXT: fld %st(2) +; SSE1-NEXT: fmuls 8(%eax) +; SSE1-NEXT: fxch %st(3) +; SSE1-NEXT: fmuls 12(%eax) +; SSE1-NEXT: fstps 12(%eax) +; SSE1-NEXT: fxch %st(2) +; SSE1-NEXT: fstps 8(%eax) +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: fstps 4(%eax) +; SSE1-NEXT: fstps (%eax) +; SSE1-NEXT: retl +; +; SSE2-LABEL: vec_canonicalize_var_v4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movaps %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; AVX2-LABEL: vec_canonicalize_var_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX2-NEXT: vmulps (%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vmovaps %xmm0, (%rdi) +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec_canonicalize_var_v4f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512F-NEXT: vmulps (%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vmovaps %xmm0, (%rdi) +; AVX512F-NEXT: retq + %val = load <4 x float>, <4 x float> addrspace(1)* %out + %canonicalized = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %val) + store <4 x float> %canonicalized, <4 x float> addrspace(1)* %out + ret 
void +} + +define void @vec_canonicalize_var_v4f64(<4 x double> addrspace(1)* %out) #1 { +; SSE1-LABEL: vec_canonicalize_var_v4f64: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: fld1 +; SSE1-NEXT: fld %st(0) +; SSE1-NEXT: fmull (%eax) +; SSE1-NEXT: fld %st(1) +; SSE1-NEXT: fmull 8(%eax) +; SSE1-NEXT: fld %st(2) +; SSE1-NEXT: fmull 16(%eax) +; SSE1-NEXT: fxch %st(3) +; SSE1-NEXT: fmull 24(%eax) +; SSE1-NEXT: fstpl 24(%eax) +; SSE1-NEXT: fxch %st(2) +; SSE1-NEXT: fstpl 16(%eax) +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: fstpl 8(%eax) +; SSE1-NEXT: fstpl (%eax) +; SSE1-NEXT: retl +; +; SSE2-LABEL: vec_canonicalize_var_v4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd {{.*#+}} xmm0 = [1.0E+0,1.0E+0] +; SSE2-NEXT: movapd 16(%rdi), %xmm1 +; SSE2-NEXT: mulpd %xmm0, %xmm1 +; SSE2-NEXT: mulpd (%rdi), %xmm0 +; SSE2-NEXT: movapd %xmm0, (%rdi) +; SSE2-NEXT: movapd %xmm1, 16(%rdi) +; SSE2-NEXT: retq +; +; AVX2-LABEL: vec_canonicalize_var_v4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX2-NEXT: vmulpd (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vmovapd %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec_canonicalize_var_v4f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512F-NEXT: vmulpd (%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovapd %ymm0, (%rdi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq + %val = load <4 x double>, <4 x double> addrspace(1)* %out + %canonicalized = call <4 x double> @llvm.canonicalize.v4f32(<4 x double> %val) + store <4 x double> %canonicalized, <4 x double> addrspace(1)* %out + ret void +} + +define void @vec_canonicalize_x86_fp80(<4 x x86_fp80> addrspace(1)* %out) #1 { +; SSE1-LABEL: vec_canonicalize_x86_fp80: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: fldt 30(%eax) +; SSE1-NEXT: fldt 20(%eax) +; SSE1-NEXT: fldt 10(%eax) +; SSE1-NEXT: fldt (%eax) +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmul %st, %st(1) +; SSE1-NEXT: fmul %st, %st(2) +; SSE1-NEXT: fmul %st, %st(3) +; SSE1-NEXT: fmulp %st, %st(4) +; SSE1-NEXT: fxch %st(3) +; SSE1-NEXT: fstpt 30(%eax) +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: fstpt 20(%eax) +; SSE1-NEXT: fstpt 10(%eax) +; SSE1-NEXT: fstpt (%eax) +; SSE1-NEXT: retl +; +; SSE2-LABEL: vec_canonicalize_x86_fp80: +; SSE2: # %bb.0: +; SSE2-NEXT: fldt 30(%rdi) +; SSE2-NEXT: fldt 20(%rdi) +; SSE2-NEXT: fldt 10(%rdi) +; SSE2-NEXT: fldt (%rdi) +; SSE2-NEXT: fld1 +; SSE2-NEXT: fmul %st, %st(1) +; SSE2-NEXT: fmul %st, %st(2) +; SSE2-NEXT: fmul %st, %st(3) +; SSE2-NEXT: fmulp %st, %st(4) +; SSE2-NEXT: fxch %st(3) +; SSE2-NEXT: fstpt 30(%rdi) +; SSE2-NEXT: fxch %st(1) +; SSE2-NEXT: fstpt 20(%rdi) +; SSE2-NEXT: fstpt 10(%rdi) +; SSE2-NEXT: fstpt (%rdi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec_canonicalize_x86_fp80: +; AVX1: # %bb.0: +; AVX1-NEXT: fldt 30(%rdi) +; AVX1-NEXT: fldt 20(%rdi) +; AVX1-NEXT: fldt 10(%rdi) +; AVX1-NEXT: fldt (%rdi) +; AVX1-NEXT: fld1 +; AVX1-NEXT: fmul %st, %st(1) +; AVX1-NEXT: fmul %st, %st(2) +; AVX1-NEXT: fmul %st, %st(3) +; AVX1-NEXT: fmulp %st, %st(4) +; AVX1-NEXT: fxch %st(3) +; AVX1-NEXT: fstpt 30(%rdi) +; AVX1-NEXT: fxch %st(1) +; AVX1-NEXT: fstpt 20(%rdi) +; AVX1-NEXT: fstpt 10(%rdi) +; AVX1-NEXT: fstpt (%rdi) +; AVX1-NEXT: retq + %val = load <4 x x86_fp80>, <4 x x86_fp80> addrspace(1)* %out + %canonicalized = call <4 x x86_fp80> @llvm.canonicalize.f80(<4 x x86_fp80> %val) + store <4 x x86_fp80> %canonicalized, <4 x x86_fp80> addrspace(1)* %out + ret void +}