Skip to content

[RISC-V V] Idea: Interleave independent op-chains by vsetvli category #142814

Open
@Validark

Description

@Validark

Zig Godbolt

LLC Godbolt

In my real code, I have some 512-bit subtractions, some of which can be run in parallel. Some real code would inline to something like this:

// equivalent to sub(a, b) & sub(c, d)
// Each @Vector(8, u64) operand stands for one 512-bit integer made of eight 64-bit lanes.
// The two subtractions (a - b and c - d) do not depend on each other; here they are
// written strictly one after the other.
export fn foo(a: @Vector(8, u64), b: @Vector(8, u64), c: @Vector(8, u64), d: @Vector(8, u64)) @Vector(8, u64) {
    // c1: one bit per lane, set where a < b (the lanes whose wrapping difference underflows).
    const c1: u8 = @as(u8, @bitCast(a < b));
    // m1: one bit per lane, set where a == b (the lanes whose difference is zero).
    const m1: u8 = @as(u8, @bitCast(a == b));
    // Lane-wise wrapping difference, before any cross-lane borrow is applied.
    const s1 = a -% b;
    // The 8-bit mask m1 ^ ((c1 << 1) +% m1) selects the lanes that get 1 subtracted:
    // adding the all-ones splat with +% is a wrapping subtract of 1; unselected lanes keep s1.
    // NOTE(review): presumably the scalar add lets a borrow ripple through a run of
    // equal lanes, and lane 0 maps to the least significant mask bit -- confirm.
    const ans1 = @select(u64, @as(@Vector(8, bool), @bitCast(m1 ^ ((c1 << 1) +% m1))), s1 +% @as(@Vector(8, u64), @splat(0xffffffffffffffff)), s1);


    // Same sequence again for the second subtraction, c - d.
    const c2: u8 = @as(u8, @bitCast(c < d));
    const m2: u8 = @as(u8, @bitCast(c == d));
    const s2 = c -% d;
    const ans2 = @select(u64, @as(@Vector(8, bool), @bitCast(m2 ^ ((c2 << 1) +% m2))), s2 +% @as(@Vector(8, u64), @splat(0xffffffffffffffff)), s2);

    // Bitwise AND of the two differences.
    return ans1 & ans2;
}

Compiled for sifive_x280, we get:

foo:
-       vsetivli        zero, 8, e64, m1, ta, ma
        vle64.v v9, (a1)
        vmv.v.i v8, 0
        vle64.v v10, (a2)
        vmsltu.vv       v11, v9, v10
        vmseq.vv        v12, v9, v10
-       vsetvli zero, zero, e8, mf8, ta, ma
        vmv.x.s a1, v11
        vle64.v v11, (a3)
        vmv.x.s a2, v12
        vle64.v v12, (a4)
        sh1add  a1, a1, a2
        xor     a1, a1, a2
        vmv.s.x v0, a1
-       vsetvli zero, zero, e64, m1, ta, ma
        vmsltu.vv       v14, v11, v12
        vmseq.vv        v15, v11, v12
        vmerge.vim      v13, v8, 1, v0
-       vsetvli zero, zero, e8, mf8, ta, ma
        vmv.x.s a1, v14
        vmv.x.s a2, v15
        sh1add  a1, a1, a2
        xor     a1, a1, a2
        vmv.s.x v0, a1
-       vsetvli zero, zero, e64, m1, ta, ma
        vsub.vv v9, v9, v10
        vsub.vv v10, v11, v12
        vmerge.vim      v8, v8, 1, v0
        vsub.vv v9, v9, v13
        vsub.vv v8, v10, v8
        vand.vv v8, v8, v9
        vse64.v v8, (a0)
        ret

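Notice how we have 5 vsetivli/vsetvli instructions? After the initial vsetivli, the other four just toggle back and forth between e64/m1 and e8/mf8, which seems unnecessary. With manual interleaving: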

// equivalent to sub(a, b) & sub(c, d)
// Same computation as foo, but the statements of the two independent subtractions are
// interleaved by hand: all compares first, then both differences, then both selects.
// Each @Vector(8, u64) operand stands for one 512-bit integer made of eight 64-bit lanes.
export fn bar(a: @Vector(8, u64), b: @Vector(8, u64), c: @Vector(8, u64), d: @Vector(8, u64)) @Vector(8, u64) {
    // Compare masks for both subtractions, one bit per lane:
    // c1/c2 are set where the minuend lane is less than the subtrahend lane,
    // m1/m2 where the two lanes are equal.
    const c1: u8 = @as(u8, @bitCast(a < b));
    const m1: u8 = @as(u8, @bitCast(a == b));
    const c2: u8 = @as(u8, @bitCast(c < d));
    const m2: u8 = @as(u8, @bitCast(c == d));

    // Lane-wise wrapping differences, before any cross-lane borrow is applied.
    const s1 = a -% b;
    const s2 = c -% d;

    // For each subtraction, the 8-bit mask m ^ ((c << 1) +% m) selects the lanes that get
    // 1 subtracted: adding the all-ones splat with +% is a wrapping subtract of 1;
    // unselected lanes keep the plain difference.
    // NOTE(review): presumably the scalar add lets a borrow ripple through a run of
    // equal lanes, and lane 0 maps to the least significant mask bit -- confirm.
    const ans1 = @select(u64, @as(@Vector(8, bool), @bitCast(m1 ^ ((c1 << 1) +% m1))), s1 +% @as(@Vector(8, u64), @splat(0xffffffffffffffff)), s1);
    const ans2 = @select(u64, @as(@Vector(8, bool), @bitCast(m2 ^ ((c2 << 1) +% m2))), s2 +% @as(@Vector(8, u64), @splat(0xffffffffffffffff)), s2);

    // Bitwise AND of the two differences.
    return ans1 & ans2;
}

We get:

bar:
+       vsetivli        zero, 8, e64, m1, ta, ma
        vle64.v v10, (a1)
        vmv.v.i v9, 0
        vle64.v v11, (a2)
        vle64.v v12, (a3)
        vle64.v v13, (a4)
        vmsltu.vv       v8, v10, v11
        vmseq.vv        v14, v10, v11
        vmsltu.vv       v15, v12, v13
        vmseq.vv        v16, v12, v13
+       vsetvli zero, zero, e8, mf8, ta, ma
        vmv.x.s a1, v8
        vmv.x.s a2, v14
        vmv.x.s a3, v15
        vmv.x.s a4, v16
        sh1add  a1, a1, a2
        xor     a1, a1, a2
        sh1add  a2, a3, a4
        xor     a2, a2, a4
        vmv.s.x v0, a1
        vmv.s.x v8, a2
+       vsetvli zero, zero, e64, m1, ta, ma
        vsub.vv v10, v10, v11
        vmerge.vim      v11, v9, 1, v0
-       vmv1r.v v0, v8
        vsub.vv v8, v12, v13
        vmerge.vim      v9, v9, 1, v0
        vsub.vv v10, v10, v11
        vsub.vv v8, v8, v9
        vand.vv v8, v8, v10
        vse64.v v8, (a0)
        ret

Now there are only 3 vsetivli/vsetvli instructions! We do have an extra vmv1r.v inserted in this version, but my feeling is that it is less expensive than the 2 vsetvli instructions we eliminated.

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions