Skip to content

Commit 3e9fb97

Browse files
committed
Add x86 encoding for SIMD imul
Only i16x8 and i32x4 are encoded in this commit, mainly because i8x16 and i64x2 do not have simple encodings in x86. i64x2 is not required by the SIMD spec, and there is an ongoing discussion (WebAssembly/simd#98 (comment)) about removing i8x16.
1 parent 1ed09e8 commit 3e9fb97

File tree

4 files changed

+62
-4
lines changed

4 files changed

+62
-4
lines changed

cranelift-codegen/meta/src/isa/x86/encodings.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2032,6 +2032,19 @@ pub(crate) fn define(
20322032
e.enc_32_64(isub, rec_fa.opcodes(opcodes));
20332033
}
20342034

2035+
// SIMD integer multiplication: the x86 ISA does not have instructions for multiplying I8x16
2036+
// and I64x2 and these are (at the time of writing) not necessary for WASM SIMD.
2037+
for (ty, opcodes, isap) in [
2038+
(I16, vec![0x66, 0x0f, 0xd5], None), // PMULLW from SSE2
2039+
(I32, vec![0x66, 0x0f, 0x38, 0x40], Some(use_sse41_simd)), // PMULLD from SSE4.1
2040+
]
2041+
.iter()
2042+
.cloned()
2043+
{
2044+
let imul = imul.bind_vector_from_lane(ty, sse_vector_size);
2045+
e.enc_32_64_maybe_isap(imul, rec_fa.opcodes(opcodes), isap);
2046+
}
2047+
20352048
// Reference type instructions
20362049

20372050
// Null references implemented as iconst 0.

cranelift-codegen/meta/src/shared/instructions.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1717,8 +1717,7 @@ pub(crate) fn define(
17171717
Wrapping integer multiplication: `a := x y \pmod{2^B}`.
17181718
17191719
This instruction does not depend on the signed/unsigned interpretation
1720-
of the
1721-
operands.
1720+
of the operands.
17221721
17231722
Polymorphic over all integer types (vector and scalar).
17241723
"#,

cranelift-wasm/src/code_translator.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -998,6 +998,10 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
998998
let a = state.pop1();
999999
state.push1(builder.ins().ineg(a))
10001000
}
1001+
Operator::I16x8Mul | Operator::I32x4Mul => {
1002+
let (a, b) = state.pop2();
1003+
state.push1(builder.ins().imul(a, b))
1004+
}
10011005
Operator::V128Load { .. }
10021006
| Operator::V128Store { .. }
10031007
| Operator::I8x16Eq
@@ -1066,13 +1070,11 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
10661070
| Operator::I16x8AddSaturateU
10671071
| Operator::I16x8SubSaturateS
10681072
| Operator::I16x8SubSaturateU
1069-
| Operator::I16x8Mul
10701073
| Operator::I32x4AnyTrue
10711074
| Operator::I32x4AllTrue
10721075
| Operator::I32x4Shl
10731076
| Operator::I32x4ShrS
10741077
| Operator::I32x4ShrU
1075-
| Operator::I32x4Mul
10761078
| Operator::I64x2AnyTrue
10771079
| Operator::I64x2AllTrue
10781080
| Operator::I64x2Shl

filetests/isa/x86/simd-arithmetic.clif

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,3 +120,47 @@ ebb0:
120120

121121
return ; bin: c3
122122
}
123+
124+
; Multiplies two constant i32x4 vectors with `imul`, pins the emitted bytes via the `bin:`
; annotation, and returns true only if lanes 0, 1 and 3 of the product equal the expected
; wrapped values. Lane 2 (1 * 2) is never extracted, so it is not checked.
function %imul_i32x4() -> b1 {
125+
ebb0:
126+
[-, %xmm0] v0 = vconst.i32x4 [-1 0 1 -2147483647] ; e.g. -2147483647 == 0x80_00_00_01
127+
[-, %xmm1] v1 = vconst.i32x4 [2 2 2 2]
128+
[-, %xmm0] v2 = imul v0, v1 ; bin: 66 0f 38 40 c1
129+
130+
; lane 0: -1 * 2 == -2
v3 = extractlane v2, 0
131+
v4 = icmp_imm eq v3, -2
132+
133+
; lane 1: 0 * 2 == 0
v5 = extractlane v2, 1
134+
v6 = icmp_imm eq v5, 0
135+
136+
; lane 3: the product overflows 32 bits and must wrap
v7 = extractlane v2, 3
137+
v8 = icmp_imm eq v7, 2 ; 0x80_00_00_01 * 2 == 0x1_00_00_00_02 (and the 1 is dropped)
138+
139+
; combine the three lane checks into a single result
v9 = band v4, v6
140+
v10 = band v8, v9
141+
return v10
142+
}
143+
; run
144+
145+
; Multiplies two constant i16x8 vectors with `imul`, pins the emitted bytes via the `bin:`
; annotation, and returns true only if lanes 0, 1 and 3 of the product equal the expected
; wrapped values. Lane 2 and the zeroed upper lanes are not checked.
function %imul_i16x8() -> b1 {
146+
ebb0:
147+
[-, %xmm1] v0 = vconst.i16x8 [-1 0 1 32767 0 0 0 0] ; e.g. 32767 == 0x7f_ff
148+
[-, %xmm2] v1 = vconst.i16x8 [2 2 2 2 0 0 0 0]
149+
[-, %xmm1] v2 = imul v0, v1 ; bin: 66 0f d5 ca
150+
151+
; lane 0: -1 * 2 == -2, compared below as its unsigned 16-bit pattern 0xfffe
v3 = extractlane v2, 0
152+
v4 = icmp_imm eq v3, 0xfffe ; TODO -2 will not work here and below because v3 is being
153+
; uextend-ed, not sextend-ed
154+
155+
; lane 1: 0 * 2 == 0
v5 = extractlane v2, 1
156+
v6 = icmp_imm eq v5, 0
157+
158+
; lane 3: largest positive i16 doubled
v7 = extractlane v2, 3
159+
v8 = icmp_imm eq v7, 0xfffe ; 0x7f_ff * 2 == 0xff_fe
160+
161+
; combine the three lane checks into a single result
v9 = band v4, v6
162+
v10 = band v8, v9
163+
164+
; Return the conjunction of all lane checks. Returning v4 alone would verify only lane 0
; and leave the lane 1 and lane 3 comparisons dead.
return v10
165+
}
166+
; run

0 commit comments

Comments
 (0)