Skip to content

Commit 6052a69

Browse files
CL/aarch64: implement the wasm SIMD i32x4.dot_i16x8_s instruction
This patch implements, for aarch64, the following wasm SIMD extension: the `i32x4.dot_i16x8_s` instruction (WebAssembly/simd#127). It also updates dependencies as follows, so that the new instruction can be parsed, decoded, etc.: wat to 1.0.27; wast to 26.0.1; wasmparser to 0.65.0; wasmprinter to 0.2.12. The changes are straightforward: (1) a new CLIF instruction `widening_pairwise_dot_product_s`; (2) translation from wasm into `widening_pairwise_dot_product_s`; (3) new AArch64 instructions `smull` and `smull2` (part of the `VecRRR` group); (4) translation from `widening_pairwise_dot_product_s` to `smull ; smull2 ; addv`. There is no test case in this commit, because the test suite is in a separate repo. The implementation has nevertheless been tested.
1 parent f6d5b87 commit 6052a69

File tree

24 files changed

+218
-50
lines changed

24 files changed

+218
-50
lines changed

Cargo.lock

Lines changed: 26 additions & 26 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,12 @@ anyhow = "1.0.19"
3737
target-lexicon = { version = "0.11.0", default-features = false }
3838
pretty_env_logger = "0.4.0"
3939
file-per-thread-logger = "0.1.1"
40-
wat = "1.0.26"
40+
wat = "1.0.27"
4141
libc = "0.2.60"
4242
log = "0.4.8"
4343
rayon = "1.2.1"
4444
humantime = "1.3.0"
45-
wasmparser = "0.63"
45+
wasmparser = "0.65"
4646

4747
[dev-dependencies]
4848
env_logger = "0.7.1"

cranelift/codegen/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ peepmatic-traits = { path = "../peepmatic/crates/traits", optional = true, versi
3030
peepmatic-runtime = { path = "../peepmatic/crates/runtime", optional = true, version = "0.67.0" }
3131
regalloc = { version = "0.0.31" }
3232
souper-ir = { version = "1", optional = true }
33-
wast = { version = "25.0.0", optional = true }
33+
wast = { version = "26.0.1", optional = true }
3434
# It is a goal of the cranelift-codegen crate to have minimal external dependencies.
3535
# Please don't add any unless they are essential to the task of creating binary
3636
# machine code. Integration tests that need external dependencies can be

cranelift/codegen/meta/src/shared/instructions.rs

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4078,6 +4078,41 @@ pub(crate) fn define(
40784078
.operands_out(vec![a]),
40794079
);
40804080

4081+
let I16x8 = &TypeVar::new(
4082+
"I16x8",
4083+
"A SIMD vector type containing 8 integer lanes each 16 bits wide.",
4084+
TypeSetBuilder::new()
4085+
.ints(16..16)
4086+
.simd_lanes(8..8)
4087+
.includes_scalars(false)
4088+
.build(),
4089+
);
4090+
4091+
let x = &Operand::new("x", I16x8);
4092+
let y = &Operand::new("y", I16x8);
4093+
let a = &Operand::new("a", &I16x8.merge_lanes());
4094+
4095+
ig.push(
4096+
Inst::new(
4097+
"widening_pairwise_dot_product_s",
4098+
r#"
4099+
Takes corresponding elements in `x` and `y`, performs a sign-extending length-doubling
4100+
multiplication on them, then adds adjacent pairs of elements to form the result. For
4101+
example, if the input vectors are `[x3, x2, x1, x0]` and `[y3, y2, y1, y0]`, it produces
4102+
the vector `[r1, r0]`, where `r1 = sx(x3) * sx(y3) + sx(x2) * sx(y2)` and
4103+
`r0 = sx(x1) * sx(y1) + sx(x0) * sx(y0)`, and `sx(n)` sign-extends `n` to twice its width.
4104+
4105+
This will double the lane width and halve the number of lanes. So the resulting
4106+
vector has the same number of bits as `x` and `y` do (individually).
4107+
4108+
See https://github.com/WebAssembly/simd/pull/127 for background info.
4109+
"#,
4110+
&formats.binary,
4111+
)
4112+
.operands_in(vec![x, y])
4113+
.operands_out(vec![a]),
4114+
);
4115+
40814116
let IntTo = &TypeVar::new(
40824117
"IntTo",
40834118
"A larger integer type with the same number of lanes",

cranelift/codegen/src/isa/aarch64/inst/args.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -660,6 +660,9 @@ impl VectorSize {
660660
}
661661
}
662662

663+
/// Produces a `VectorSize` with lanes twice as wide. Note that if the resulting
664+
/// size would exceed 128 bits, then the number of lanes is also halved, so as to
665+
/// ensure that the result size is at most 128 bits.
663666
pub fn widen(&self) -> VectorSize {
664667
match self {
665668
VectorSize::Size8x8 => VectorSize::Size16x8,
@@ -672,6 +675,7 @@ impl VectorSize {
672675
}
673676
}
674677

678+
/// Produces a `VectorSize` that has the same lane width, but half as many lanes.
675679
pub fn halve(&self) -> VectorSize {
676680
match self {
677681
VectorSize::Size8x16 => VectorSize::Size8x8,

cranelift/codegen/src/isa/aarch64/inst/emit.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1903,6 +1903,8 @@ impl MachInstEmit for Inst {
19031903
(0b001_01110_00_1 | enc_size << 1, 0b100000)
19041904
}
19051905
VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110),
1906+
VecALUOp::Smull => (0b00001110_00_1 | enc_size << 1, 0b110000),
1907+
VecALUOp::Smull2 => (0b01001110_00_1 | enc_size << 1, 0b110000),
19061908
};
19071909
let top11 = if is_float {
19081910
top11 | enc_float_size << 1

cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3223,6 +3223,78 @@ fn test_aarch64_binemit() {
32233223
"zip1 v9.2d, v20.2d, v17.2d",
32243224
));
32253225

3226+
insns.push((
3227+
Inst::VecRRR {
3228+
alu_op: VecALUOp::Smull,
3229+
rd: writable_vreg(16),
3230+
rn: vreg(12),
3231+
rm: vreg(1),
3232+
size: VectorSize::Size8x16,
3233+
},
3234+
"90C1210E",
3235+
"smull v16.8h, v12.8b, v1.8b",
3236+
));
3237+
3238+
insns.push((
3239+
Inst::VecRRR {
3240+
alu_op: VecALUOp::Smull,
3241+
rd: writable_vreg(2),
3242+
rn: vreg(13),
3243+
rm: vreg(6),
3244+
size: VectorSize::Size16x8,
3245+
},
3246+
"A2C1660E",
3247+
"smull v2.4s, v13.4h, v6.4h",
3248+
));
3249+
3250+
insns.push((
3251+
Inst::VecRRR {
3252+
alu_op: VecALUOp::Smull,
3253+
rd: writable_vreg(8),
3254+
rn: vreg(12),
3255+
rm: vreg(14),
3256+
size: VectorSize::Size32x4,
3257+
},
3258+
"88C1AE0E",
3259+
"smull v8.2d, v12.2s, v14.2s",
3260+
));
3261+
3262+
insns.push((
3263+
Inst::VecRRR {
3264+
alu_op: VecALUOp::Smull2,
3265+
rd: writable_vreg(16),
3266+
rn: vreg(12),
3267+
rm: vreg(1),
3268+
size: VectorSize::Size8x16,
3269+
},
3270+
"90C1214E",
3271+
"smull2 v16.8h, v12.16b, v1.16b",
3272+
));
3273+
3274+
insns.push((
3275+
Inst::VecRRR {
3276+
alu_op: VecALUOp::Smull2,
3277+
rd: writable_vreg(2),
3278+
rn: vreg(13),
3279+
rm: vreg(6),
3280+
size: VectorSize::Size16x8,
3281+
},
3282+
"A2C1664E",
3283+
"smull2 v2.4s, v13.8h, v6.8h",
3284+
));
3285+
3286+
insns.push((
3287+
Inst::VecRRR {
3288+
alu_op: VecALUOp::Smull2,
3289+
rd: writable_vreg(8),
3290+
rn: vreg(12),
3291+
rm: vreg(14),
3292+
size: VectorSize::Size32x4,
3293+
},
3294+
"88C1AE4E",
3295+
"smull2 v8.2d, v12.4s, v14.4s",
3296+
));
3297+
32263298
insns.push((
32273299
Inst::VecMisc {
32283300
op: VecMisc2::Not,

0 commit comments

Comments
 (0)