Skip to content

Commit 0f92435

Browse files
committed
Add x86 encoding for SIMD imul
Only i16x8 and i32x4 are encoded in this commit mainly because i8x16 and i64x2 do not have simple encodings in x86. i64x2 is not required by the SIMD spec and there is discussion (WebAssembly/simd#98 (comment)) about removing i8x16.
1 parent c33cb52 commit 0f92435

File tree

4 files changed

+62
-4
lines changed

4 files changed

+62
-4
lines changed

cranelift-codegen/meta/src/isa/x86/encodings.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2012,6 +2012,19 @@ pub(crate) fn define(
20122012
e.enc_32_64(isub, rec_fa.opcodes(opcodes));
20132013
}
20142014

2015+
// SIMD integer multiplication: the x86 ISA does not have instructions for multiplying I8x16
2016+
// and I64x2 and these are (at the time of writing) not necessary for WASM SIMD.
2017+
for (ty, opcodes, isap) in [
2018+
(I16, vec![0x66, 0x0f, 0xd5], None), // PMULLW from SSE2
2019+
(I32, vec![0x66, 0x0f, 0x38, 0x40], Some(use_sse41_simd)), // PMULLD from SSE4.1
2020+
]
2021+
.iter()
2022+
.cloned()
2023+
{
2024+
let imul = imul.bind_vector_from_lane(ty, sse_vector_size);
2025+
e.enc_32_64_maybe_isap(imul, rec_fa.opcodes(opcodes), isap);
2026+
}
2027+
20152028
// Reference type instructions
20162029

20172030
// Null references implemented as iconst 0.

cranelift-codegen/meta/src/shared/instructions.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1682,8 +1682,7 @@ pub(crate) fn define(
16821682
Wrapping integer multiplication: `a := x y \pmod{2^B}`.
16831683
16841684
This instruction does not depend on the signed/unsigned interpretation
1685-
of the
1686-
operands.
1685+
of the operands.
16871686
16881687
Polymorphic over all integer types (vector and scalar).
16891688
"#,

cranelift-wasm/src/code_translator.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -986,6 +986,10 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
986986
let a = state.pop1();
987987
state.push1(builder.ins().ineg(a))
988988
}
989+
Operator::I16x8Mul | Operator::I32x4Mul => {
990+
let (a, b) = state.pop2();
991+
state.push1(builder.ins().imul(a, b))
992+
}
989993
Operator::V128Load { .. }
990994
| Operator::V128Store { .. }
991995
| Operator::V8x16Shuffle { .. }
@@ -1055,13 +1059,11 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
10551059
| Operator::I16x8AddSaturateU
10561060
| Operator::I16x8SubSaturateS
10571061
| Operator::I16x8SubSaturateU
1058-
| Operator::I16x8Mul
10591062
| Operator::I32x4AnyTrue
10601063
| Operator::I32x4AllTrue
10611064
| Operator::I32x4Shl
10621065
| Operator::I32x4ShrS
10631066
| Operator::I32x4ShrU
1064-
| Operator::I32x4Mul
10651067
| Operator::I64x2AnyTrue
10661068
| Operator::I64x2AllTrue
10671069
| Operator::I64x2Shl

filetests/isa/x86/simd-arithmetic.clif

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,3 +120,47 @@ ebb0:
120120

121121
return ; bin: c3
122122
}
123+
124+
function %imul_i32x4() -> b1 {
125+
ebb0:
126+
[-, %xmm0] v0 = vconst.i32x4 [-1 0 1 -2147483647] ; e.g. -2147483647 == 0x80_00_00_01
127+
[-, %xmm1] v1 = vconst.i32x4 [2 2 2 2] ; every lane of v0 is multiplied by 2
128+
[-, %xmm0] v2 = imul v0, v1 ; bin: 66 0f 38 40 c1
129+
130+
v3 = extractlane v2, 0
131+
v4 = icmp_imm eq v3, -2 ; lane 0: -1 * 2 == -2
132+
133+
v5 = extractlane v2, 1
134+
v6 = icmp_imm eq v5, 0 ; lane 1: 0 * 2 == 0
135+
136+
v7 = extractlane v2, 3
137+
v8 = icmp_imm eq v7, 2 ; 0x80_00_00_01 * 2 == 0x1_00_00_00_02 (and the 1 is dropped)
138+
139+
v9 = band v4, v6
140+
v10 = band v8, v9
141+
return v10 ; true only if lanes 0, 1 and 3 all hold the expected product (lane 2 is not checked)
142+
}
143+
; run
144+
145+
function %imul_i16x8() -> b1 {
146+
ebb0:
147+
[-, %xmm1] v0 = vconst.i16x8 [-1 0 1 32767 0 0 0 0] ; e.g. 32767 == 0x7f_ff
148+
[-, %xmm2] v1 = vconst.i16x8 [2 2 2 2 0 0 0 0]
149+
[-, %xmm1] v2 = imul v0, v1 ; bin: 66 0f d5 ca
150+
151+
v3 = extractlane v2, 0
152+
v4 = icmp_imm eq v3, 0xfffe ; TODO -2 will not work here and below because v3 is being
153+
; uextend-ed, not sextend-ed
154+
155+
v5 = extractlane v2, 1
156+
v6 = icmp_imm eq v5, 0 ; lane 1: 0 * 2 == 0
157+
158+
v7 = extractlane v2, 3
159+
v8 = icmp_imm eq v7, 0xfffe ; 0x7f_ff * 2 == 0xff_fe
160+
161+
v9 = band v4, v6
162+
v10 = band v8, v9
163+
164+
return v10 ; combine all three lane checks (lanes 0, 1 and 3), not just lane 0
165+
}
166+
; run

0 commit comments

Comments
 (0)