Skip to content

FMA instruction #1438

@9il

Description

@9il

Hello,

I am looking for a portable way to get vectorized FMA operations for BLAS.

Variant 1

fast attribute

/// Dot product of `a` and `b`, accumulated in a `double`.
///
/// Each iteration runs a small inline LLVM IR snippet that multiplies the two
/// current elements and adds the running sum; both the `fmul` and the `fadd`
/// carry the `fast` flag.
///
/// Params:
///     a = left operand; its length is the iteration count
///     b = right operand; indexed with the same `i` as `a`, so it needs at
///         least `a.length` elements
/// Returns: the accumulated sum (0 when `a` is empty).
double dot(double[] a, double[] b)
{
    typeof(return) s = 0;

    // The loop bound is `a.length`; `a` and `b` are the only arrays in scope.
    foreach(size_t i; 0..a.length)
    {
        // IR operands: %0 = a[i], %1 = b[i], %2 = s; yields %0 * %1 + %2.
        s = inlineIR!(`
        %p = fmul fast double %0, %1
        %r = fadd fast double %p, %2
        ret double %r`, double)(a[i], b[i], s);
    }

    return s;
}

Result: vectorized, but no FMA (separate `vmulpd` / `vaddpd` instructions)

LBB0_8:
    vmovupd -224(%rax), %ymm4
    vmovupd -192(%rax), %ymm5
    vmovupd -160(%rax), %ymm6
    vmovupd -128(%rax), %ymm7
    vmulpd  -224(%rsi), %ymm4, %ymm4
    vmulpd  -192(%rsi), %ymm5, %ymm5
    vmulpd  -160(%rsi), %ymm6, %ymm6
    vmulpd  -128(%rsi), %ymm7, %ymm7
    vaddpd  %ymm0, %ymm4, %ymm0
    vaddpd  %ymm1, %ymm5, %ymm1
    vaddpd  %ymm2, %ymm6, %ymm2
    vaddpd  %ymm3, %ymm7, %ymm3
    vmovupd -96(%rax), %ymm4
    vmovupd -64(%rax), %ymm5
    vmovupd -32(%rax), %ymm6
    vmovupd (%rax), %ymm7
    vmulpd  -96(%rsi), %ymm4, %ymm4
    vmulpd  -64(%rsi), %ymm5, %ymm5
    vmulpd  -32(%rsi), %ymm6, %ymm6
    vmulpd  (%rsi), %ymm7, %ymm7
    vaddpd  %ymm0, %ymm4, %ymm0
    vaddpd  %ymm1, %ymm5, %ymm1
    vaddpd  %ymm2, %ymm6, %ymm2
    vaddpd  %ymm3, %ymm7, %ymm3
    addq    $256, %rsi
    addq    $256, %rax
    addq    $-32, %rdx
    jne LBB0_8

Variant 2

llvm_fmuladd function

/// Dot product of `a` and `b`, folded into a running sum with one
/// `llvm_fmuladd` call per element.
///
/// Params:
///     a = left operand; its length is the iteration count
///     b = right operand; indexed with the same `i` as `a`
/// Returns: the accumulated sum (0 when `a` is empty).
double dot(double[] a, double[] b)
{
    // Accumulator; `typeof(return)` resolves to `double` for this function.
    typeof(return) s = 0;

    foreach(size_t i; 0..a.length)
    {
        // Each step feeds the previous `s` back in as the third argument.
        // NOTE(review): `llvm_fmuladd` is presumably LDC's binding of the
        // LLVM `llvm.fmuladd` intrinsic (a * b + c) -- confirm against
        // ldc.intrinsics.
        s = llvm_fmuladd(a[i], b[i], s);
    }

    return s;
}

Result: FMA is emitted, but no vectorization (scalar `vfmadd132sd` / `vfmadd231sd` only)

LBB0_7:
    vmovsd  -56(%rcx), %xmm1
    vmovsd  -48(%rcx), %xmm2
    vfmadd132sd -56(%rax), %xmm0, %xmm1
    vfmadd231sd -48(%rax), %xmm2, %xmm1
    vmovsd  -40(%rcx), %xmm0
    vfmadd132sd -40(%rax), %xmm1, %xmm0
    vmovsd  -32(%rcx), %xmm1
    vfmadd132sd -32(%rax), %xmm0, %xmm1
    vmovsd  -24(%rcx), %xmm0
    vfmadd132sd -24(%rax), %xmm1, %xmm0
    vmovsd  -16(%rcx), %xmm1
    vfmadd132sd -16(%rax), %xmm0, %xmm1
    vmovsd  -8(%rcx), %xmm2
    vfmadd132sd -8(%rax), %xmm1, %xmm2
    vmovsd  (%rcx), %xmm0
    vfmadd132sd (%rax), %xmm2, %xmm0
    addq    $64, %rax
    addq    $64, %rcx
    addq    $-8, %rdx
    jne LBB0_7

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions