In my real code, I have some 512-bit subtractions, some of which can be run in parallel. After inlining, that code looks something like this:
// equivalent to sub(a, b) & sub(c, d)
export fn foo(a: @Vector(8, u64), b: @Vector(8, u64), c: @Vector(8, u64), d: @Vector(8, u64)) @Vector(8, u64) {
    const c1: u8 = @as(u8, @bitCast(a < b));
    const m1: u8 = @as(u8, @bitCast(a == b));
    const s1 = a -% b;
    const ans1 = @select(u64, @as(@Vector(8, bool), @bitCast(m1 ^ ((c1 << 1) +% m1))), s1 +% @as(@Vector(8, u64), @splat(0xffffffffffffffff)), s1);
    const c2: u8 = @as(u8, @bitCast(c < d));
    const m2: u8 = @as(u8, @bitCast(c == d));
    const s2 = c -% d;
    const ans2 = @select(u64, @as(@Vector(8, bool), @bitCast(m2 ^ ((c2 << 1) +% m2))), s2 +% @as(@Vector(8, u64), @splat(0xffffffffffffffff)), s2);
    return ans1 & ans2;
}
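For context, here is a minimal sketch of the kind of single-subtraction helper that would inline into foo; the name sub512 is hypothetical and the comments are my reading of the borrow-propagation trick (treating element 0 as the least-significant limb), not taken from the real code:

fn sub512(a: @Vector(8, u64), b: @Vector(8, u64)) @Vector(8, u64) {
    const borrows: u8 = @bitCast(a < b); // limbs that generate a borrow
    const eqs: u8 = @bitCast(a == b); // limbs that propagate an incoming borrow
    const s = a -% b;
    // Carry propagation by addition: set exactly the bits of the limbs that
    // receive a borrow from a lower limb.
    const borrow_in: u8 = eqs ^ ((borrows << 1) +% eqs);
    const all_ones: @Vector(8, u64) = @splat(0xffffffffffffffff);
    return @select(u64, @as(@Vector(8, bool), @bitCast(borrow_in)), s +% all_ones, s);
}

With a helper like this, foo is just sub512(a, b) & sub512(c, d), which is the shape the comment above describes.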
Compiled for sifive_x280, we get:
foo:
- vsetivli zero, 8, e64, m1, ta, ma
vle64.v v9, (a1)
vmv.v.i v8, 0
vle64.v v10, (a2)
vmsltu.vv v11, v9, v10
vmseq.vv v12, v9, v10
- vsetvli zero, zero, e8, mf8, ta, ma
vmv.x.s a1, v11
vle64.v v11, (a3)
vmv.x.s a2, v12
vle64.v v12, (a4)
sh1add a1, a1, a2
xor a1, a1, a2
vmv.s.x v0, a1
- vsetvli zero, zero, e64, m1, ta, ma
vmsltu.vv v14, v11, v12
vmseq.vv v15, v11, v12
vmerge.vim v13, v8, 1, v0
- vsetvli zero, zero, e8, mf8, ta, ma
vmv.x.s a1, v14
vmv.x.s a2, v15
sh1add a1, a1, a2
xor a1, a1, a2
vmv.s.x v0, a1
- vsetvli zero, zero, e64, m1, ta, ma
vsub.vv v9, v9, v10
vsub.vv v10, v11, v12
vmerge.vim v8, v8, 1, v0
vsub.vv v9, v9, v13
vsub.vv v8, v10, v8
vand.vv v8, v8, v9
vse64.v v8, (a0)
ret
Notice how we have 5 vsetivli/vsetvli instructions? That seems a bit unnecessary: every switch between the e64 arithmetic and the e8 mask moves costs another vsetvli. With manual interleaving:
// equivalent to sub(a, b) & sub(c, d)
export fn bar(a: @Vector(8, u64), b: @Vector(8, u64), c: @Vector(8, u64), d: @Vector(8, u64)) @Vector(8, u64) {
    const c1: u8 = @as(u8, @bitCast(a < b));
    const m1: u8 = @as(u8, @bitCast(a == b));
    const c2: u8 = @as(u8, @bitCast(c < d));
    const m2: u8 = @as(u8, @bitCast(c == d));
    const s1 = a -% b;
    const s2 = c -% d;
    const ans1 = @select(u64, @as(@Vector(8, bool), @bitCast(m1 ^ ((c1 << 1) +% m1))), s1 +% @as(@Vector(8, u64), @splat(0xffffffffffffffff)), s1);
    const ans2 = @select(u64, @as(@Vector(8, bool), @bitCast(m2 ^ ((c2 << 1) +% m2))), s2 +% @as(@Vector(8, u64), @splat(0xffffffffffffffff)), s2);
    return ans1 & ans2;
}
We get:
bar:
+ vsetivli zero, 8, e64, m1, ta, ma
vle64.v v10, (a1)
vmv.v.i v9, 0
vle64.v v11, (a2)
vle64.v v12, (a3)
vle64.v v13, (a4)
vmsltu.vv v8, v10, v11
vmseq.vv v14, v10, v11
vmsltu.vv v15, v12, v13
vmseq.vv v16, v12, v13
+ vsetvli zero, zero, e8, mf8, ta, ma
vmv.x.s a1, v8
vmv.x.s a2, v14
vmv.x.s a3, v15
vmv.x.s a4, v16
sh1add a1, a1, a2
xor a1, a1, a2
sh1add a2, a3, a4
xor a2, a2, a4
vmv.s.x v0, a1
vmv.s.x v8, a2
+ vsetvli zero, zero, e64, m1, ta, ma
vsub.vv v10, v10, v11
vmerge.vim v11, v9, 1, v0
- vmv1r.v v0, v8
vsub.vv v8, v12, v13
vmerge.vim v9, v9, 1, v0
vsub.vv v10, v10, v11
vsub.vv v8, v8, v9
vand.vv v8, v8, v10
vse64.v v8, (a0)
ret
Now there are only 3 vsetivli/vsetvli instructions! We do have an extra vmv1r.v inserted in this version, but my feeling is that it is less expensive than the 2 vsetvli we eliminated.