Skip to content

Commit 61cab5d

Browse files
gwennalexcrichton
authored andcommitted
ssse3 (#224)
* ssse3: _mm_abs_pi8 failing Intrinsic has incorrect return type! <8 x i8> (<8 x i8>)* @llvm.x86.ssse3.pabs.b * Introduce a x86_mmx type And make it compatible with i8x8 and u8x8. Alex suggested to change the i8x8 declaration as: ``` struct i8x8(i64); ``` But I don't see how to make it compatible with the existing code/macros. * ssse3: _mm_abs_pi16, _mm_abs_pi32, _mm_shuffle_pi8 * ssse3: _mm_abs_pi16, _mm_abs_pi32, _mm_shuffle_pi8 tests * Replace x86_mmx by __m64 * ssse3: _mm_sign_pi8, _mm_sign_pi16, _mm_sign_pi32 * ssse3: _mm_mulhrs_pi16 * ssse3: _mm_maddubs_pi16 * ssse3: _mm_hsub_pi16, _mm_hsub_pi32, _mm_hsubs_pi16 * ssse3: _mm_hadd_pi16, _mm_hadd_pi32, _mm_hadds_pi16 * Move some ssse3 intrinsics from i586 to i686
1 parent f8f6797 commit 61cab5d

File tree

2 files changed

+375
-0
lines changed

2 files changed

+375
-0
lines changed

coresimd/src/x86/i686/mod.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ pub use self::sse::*;
99
mod sse2;
1010
pub use self::sse2::*;
1111

12+
mod ssse3;
13+
pub use self::ssse3::*;
14+
1215
mod sse41;
1316
pub use self::sse41::*;
1417

coresimd/src/x86/i686/ssse3.rs

Lines changed: 372 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,372 @@
1+
//! Supplemental Streaming SIMD Extensions 3 (SSSE3)
2+
3+
#[cfg(test)]
4+
use stdsimd_test::assert_instr;
5+
6+
use core::mem;
7+
use v64::*;
8+
use x86::__m64;
9+
10+
/// Compute the absolute value of packed 8-bit integers in `a` and
11+
/// return the unsigned results.
12+
#[inline(always)]
13+
#[target_feature = "+ssse3"]
14+
#[cfg_attr(test, assert_instr(pabsb))]
15+
pub unsafe fn _mm_abs_pi8(a: i8x8) -> u8x8 {
16+
mem::transmute(pabsb(mem::transmute(a)))
17+
}
18+
19+
/// Compute the absolute value of packed 8-bit integers in `a`, and return the
20+
/// unsigned results.
21+
#[inline(always)]
22+
#[target_feature = "+ssse3"]
23+
#[cfg_attr(test, assert_instr(pabsw))]
24+
pub unsafe fn _mm_abs_pi16(a: i16x4) -> u16x4 {
25+
mem::transmute(pabsw(mem::transmute(a)))
26+
}
27+
28+
/// Compute the absolute value of packed 32-bit integers in `a`, and return the
29+
/// unsigned results.
30+
#[inline(always)]
31+
#[target_feature = "+ssse3"]
32+
#[cfg_attr(test, assert_instr(pabsd))]
33+
pub unsafe fn _mm_abs_pi32(a: i32x2) -> u32x2 {
34+
mem::transmute(pabsd(mem::transmute(a)))
35+
}
36+
37+
/// Shuffle packed 8-bit integers in `a` according to shuffle control mask in
38+
/// the corresponding 8-bit element of `b`, and return the results
39+
#[inline(always)]
40+
#[target_feature = "+ssse3"]
41+
#[cfg_attr(test, assert_instr(pshufb))]
42+
pub unsafe fn _mm_shuffle_pi8(a: u8x8, b: u8x8) -> u8x8 {
43+
mem::transmute(pshufb(mem::transmute(a), mem::transmute(b)))
44+
}
45+
46+
/// Concatenates the two 64-bit integer vector operands, and right-shifts
47+
/// the result by the number of bytes specified in the immediate operand.
48+
/*#[inline(always)]
49+
#[target_feature = "+ssse3"]
50+
#[cfg_attr(test, assert_instr(palignr, n = 15))]
51+
pub unsafe fn _mm_alignr_pi8(a: i8x8, b: i8x8, n: i32) -> i8x8 {
52+
mem::transmute(palignrb(mem::transmute(a), mem::transmute(b), n))
53+
}*/
54+
55+
/// Horizontally add the adjacent pairs of values contained in 2 packed
56+
/// 64-bit vectors of [4 x i16].
57+
#[inline(always)]
58+
#[target_feature = "+ssse3"]
59+
#[cfg_attr(test, assert_instr(phaddw))]
60+
pub unsafe fn _mm_hadd_pi16(a: i16x4, b: i16x4) -> i16x4 {
61+
mem::transmute(phaddw(mem::transmute(a), mem::transmute(b)))
62+
}
63+
64+
/// Horizontally add the adjacent pairs of values contained in 2 packed
65+
/// 64-bit vectors of [2 x i32].
66+
#[inline(always)]
67+
#[target_feature = "+ssse3"]
68+
#[cfg_attr(test, assert_instr(phaddd))]
69+
pub unsafe fn _mm_hadd_pi32(a: i32x2, b: i32x2) -> i32x2 {
70+
mem::transmute(phaddd(mem::transmute(a), mem::transmute(b)))
71+
}
72+
73+
/// Horizontally add the adjacent pairs of values contained in 2 packed
74+
/// 64-bit vectors of [4 x i16]. Positive sums greater than 7FFFh are
75+
/// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
76+
#[inline(always)]
77+
#[target_feature = "+ssse3"]
78+
#[cfg_attr(test, assert_instr(phaddsw))]
79+
pub unsafe fn _mm_hadds_pi16(a: i16x4, b: i16x4) -> i16x4 {
80+
mem::transmute(phaddsw(mem::transmute(a), mem::transmute(b)))
81+
}
82+
83+
/// Horizontally subtracts the adjacent pairs of values contained in 2
84+
/// packed 64-bit vectors of [4 x i16].
85+
#[inline(always)]
86+
#[target_feature = "+ssse3"]
87+
#[cfg_attr(test, assert_instr(phsubsw))]
88+
pub unsafe fn _mm_hsub_pi16(a: i16x4, b: i16x4) -> i16x4 {
89+
mem::transmute(phsubsw(mem::transmute(a), mem::transmute(b)))
90+
}
91+
92+
/// Horizontally subtracts the adjacent pairs of values contained in 2
93+
/// packed 64-bit vectors of [2 x i32].
94+
#[inline(always)]
95+
#[target_feature = "+ssse3"]
96+
#[cfg_attr(test, assert_instr(phsubd))]
97+
pub unsafe fn _mm_hsub_pi32(a: i32x2, b: i32x2) -> i32x2 {
98+
mem::transmute(phsubd(mem::transmute(a), mem::transmute(b)))
99+
}
100+
101+
/// Horizontally subtracts the adjacent pairs of values contained in 2
102+
/// packed 64-bit vectors of [4 x i16]. Positive differences greater than
103+
/// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
104+
/// saturated to 8000h.
105+
#[inline(always)]
106+
#[target_feature = "+ssse3"]
107+
#[cfg_attr(test, assert_instr(phsubsw))]
108+
pub unsafe fn _mm_hsubs_pi16(a: i16x4, b: i16x4) -> i16x4 {
109+
mem::transmute(phsubsw(mem::transmute(a), mem::transmute(b)))
110+
}
111+
112+
/// Multiplies corresponding pairs of packed 8-bit unsigned integer
113+
/// values contained in the first source operand and packed 8-bit signed
114+
/// integer values contained in the second source operand, adds pairs of
115+
/// contiguous products with signed saturation, and writes the 16-bit sums to
116+
/// the corresponding bits in the destination.
117+
#[inline(always)]
118+
#[target_feature = "+ssse3"]
119+
#[cfg_attr(test, assert_instr(pmaddubsw))]
120+
pub unsafe fn _mm_maddubs_pi16(a: u8x8, b: i8x8) -> i16x4 {
121+
mem::transmute(pmaddubsw(mem::transmute(a), mem::transmute(b)))
122+
}
123+
124+
/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
125+
/// products to the 18 most significant bits by right-shifting, rounds the
126+
/// truncated value by adding 1, and writes bits [16:1] to the destination.
127+
#[inline(always)]
128+
#[target_feature = "+ssse3"]
129+
#[cfg_attr(test, assert_instr(pmulhrsw))]
130+
pub unsafe fn _mm_mulhrs_pi16(a: i16x4, b: i16x4) -> i16x4 {
131+
mem::transmute(pmulhrsw(mem::transmute(a), mem::transmute(b)))
132+
}
133+
134+
/// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit
135+
/// integer in `b` is negative, and return the results.
136+
/// Element in result are zeroed out when the corresponding element in `b` is
137+
/// zero.
138+
#[inline(always)]
139+
#[target_feature = "+ssse3"]
140+
#[cfg_attr(test, assert_instr(psignb))]
141+
pub unsafe fn _mm_sign_pi8(a: i8x8, b: i8x8) -> i8x8 {
142+
mem::transmute(psignb(mem::transmute(a), mem::transmute(b)))
143+
}
144+
145+
/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit
146+
/// integer in `b` is negative, and return the results.
147+
/// Element in result are zeroed out when the corresponding element in `b` is
148+
/// zero.
149+
#[inline(always)]
150+
#[target_feature = "+ssse3"]
151+
#[cfg_attr(test, assert_instr(psignw))]
152+
pub unsafe fn _mm_sign_pi16(a: i16x4, b: i16x4) -> i16x4 {
153+
mem::transmute(psignw(mem::transmute(a), mem::transmute(b)))
154+
}
155+
156+
/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit
157+
/// integer in `b` is negative, and return the results.
158+
/// Element in result are zeroed out when the corresponding element in `b` is
159+
/// zero.
160+
#[inline(always)]
161+
#[target_feature = "+ssse3"]
162+
#[cfg_attr(test, assert_instr(psignd))]
163+
pub unsafe fn _mm_sign_pi32(a: i32x2, b: i32x2) -> i32x2 {
164+
mem::transmute(psignd(mem::transmute(a), mem::transmute(b)))
165+
}
166+
167+
// Raw LLVM intrinsic bindings used by the public wrappers in this file.
// Each `link_name` resolves to an `llvm.x86.ssse3.*` intrinsic, and every
// vector operand and result is declared as `__m64`, which is why the wrappers
// transmute their typed 64-bit vectors on the way in and out.
// NOTE(review): `improper_ctypes` is presumably silenced because the lint does
// not consider `__m64` FFI-safe — confirm.
#[allow(improper_ctypes)]
168+
extern "C" {
169+
#[link_name = "llvm.x86.ssse3.pabs.b"]
170+
fn pabsb(a: __m64) -> __m64;
171+
172+
#[link_name = "llvm.x86.ssse3.pabs.w"]
173+
fn pabsw(a: __m64) -> __m64;
174+
175+
#[link_name = "llvm.x86.ssse3.pabs.d"]
176+
fn pabsd(a: __m64) -> __m64;
177+
178+
#[link_name = "llvm.x86.ssse3.pshuf.b"]
179+
fn pshufb(a: __m64, b: __m64) -> __m64;
180+
181+
// Binding for the disabled `_mm_alignr_pi8` wrapper; kept commented out with it.
/*#[link_name = "llvm.x86.mmx.palignr.b"]
182+
fn palignrb(a: __m64, b: __m64, n: i32) -> __m64;*/
183+
184+
#[link_name = "llvm.x86.ssse3.phadd.w"]
185+
fn phaddw(a: __m64, b: __m64) -> __m64;
186+
187+
#[link_name = "llvm.x86.ssse3.phadd.d"]
188+
fn phaddd(a: __m64, b: __m64) -> __m64;
189+
190+
#[link_name = "llvm.x86.ssse3.phadd.sw"]
191+
fn phaddsw(a: __m64, b: __m64) -> __m64;
192+
193+
#[link_name = "llvm.x86.ssse3.phsub.w"]
194+
fn phsubw(a: __m64, b: __m64) -> __m64;
195+
196+
#[link_name = "llvm.x86.ssse3.phsub.d"]
197+
fn phsubd(a: __m64, b: __m64) -> __m64;
198+
199+
#[link_name = "llvm.x86.ssse3.phsub.sw"]
200+
fn phsubsw(a: __m64, b: __m64) -> __m64;
201+
202+
#[link_name = "llvm.x86.ssse3.pmadd.ub.sw"]
203+
fn pmaddubsw(a: __m64, b: __m64) -> __m64;
204+
205+
#[link_name = "llvm.x86.ssse3.pmul.hr.sw"]
206+
fn pmulhrsw(a: __m64, b: __m64) -> __m64;
207+
208+
#[link_name = "llvm.x86.ssse3.psign.b"]
209+
fn psignb(a: __m64, b: __m64) -> __m64;
210+
211+
#[link_name = "llvm.x86.ssse3.psign.w"]
212+
fn psignw(a: __m64, b: __m64) -> __m64;
213+
214+
#[link_name = "llvm.x86.ssse3.psign.d"]
215+
fn psignd(a: __m64, b: __m64) -> __m64;
216+
}
217+
218+
#[cfg(test)]
mod tests {
    // Each test feeds small hand-computed vectors to one wrapper from the
    // parent module and compares every element of the result.
    use stdsimd_test::simd_test;

    use v64::*;
    use x86::i686::ssse3;

    #[simd_test = "ssse3"]
    unsafe fn _mm_abs_pi8() {
        let r = ssse3::_mm_abs_pi8(i8x8::splat(-5));
        assert_eq!(r, u8x8::splat(5));
    }

    #[simd_test = "ssse3"]
    unsafe fn _mm_abs_pi16() {
        let r = ssse3::_mm_abs_pi16(i16x4::splat(-5));
        assert_eq!(r, u16x4::splat(5));
    }

    #[simd_test = "ssse3"]
    unsafe fn _mm_abs_pi32() {
        let r = ssse3::_mm_abs_pi32(i32x2::splat(-5));
        assert_eq!(r, u32x2::splat(5));
    }

    #[simd_test = "ssse3"]
    unsafe fn _mm_shuffle_pi8() {
        let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
        // The control bytes include 128 (expected to produce 0) and values
        // above 7 (24, 12, 19), which are expected to select elements
        // 0, 4 and 3 of `a`.
        let b = u8x8::new(4, 128, 4, 3, 24, 12, 6, 19);
        let expected = u8x8::new(5, 0, 5, 4, 1, 5, 7, 4);
        let r = ssse3::_mm_shuffle_pi8(a, b);
        assert_eq!(r, expected);
    }

    // Test for the disabled `_mm_alignr_pi8` wrapper; kept commented out
    // together with it.
    /*#[simd_test = "ssse3"]
    unsafe fn _mm_alignr_pi8() {
        let a = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
        let b = i8x8::new(4, 63, 4, 3, 24, 12, 6, 19);
        let r = ssse3::_mm_alignr_pi8(a, b, 33);
        assert_eq!(r, i8x8::splat(0));

        let r = ssse3::_mm_alignr_pi8(a, b, 17);
        let expected = i8x8::new(2, 3, 4, 5, 6, 7, 8, 0);
        assert_eq!(r, expected);

        let r = ssse3::_mm_alignr_pi8(a, b, 16);
        assert_eq!(r, a);

        let r = ssse3::_mm_alignr_pi8(a, b, 15);
        let expected = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq!(r, expected);

        let r = ssse3::_mm_alignr_pi8(a, b, 0);
        assert_eq!(r, b);
    }*/

    #[simd_test = "ssse3"]
    unsafe fn _mm_hadd_pi16() {
        let a = i16x4::new(1, 2, 3, 4);
        let b = i16x4::new(4, 128, 4, 3);
        let expected = i16x4::new(3, 7, 132, 7);
        let r = ssse3::_mm_hadd_pi16(a, b);
        assert_eq!(r, expected);
    }

    #[simd_test = "ssse3"]
    unsafe fn _mm_hadd_pi32() {
        let a = i32x2::new(1, 2);
        let b = i32x2::new(4, 128);
        let expected = i32x2::new(3, 132);
        let r = ssse3::_mm_hadd_pi32(a, b);
        assert_eq!(r, expected);
    }

    #[simd_test = "ssse3"]
    unsafe fn _mm_hadds_pi16() {
        let a = i16x4::new(1, 2, 3, 4);
        // 32767 + 1 and -32768 + -1 both leave the i16 range and must clamp.
        let b = i16x4::new(32767, 1, -32768, -1);
        let expected = i16x4::new(3, 7, 32767, -32768);
        let r = ssse3::_mm_hadds_pi16(a, b);
        assert_eq!(r, expected);
    }

    #[simd_test = "ssse3"]
    unsafe fn _mm_hsub_pi16() {
        let a = i16x4::new(1, 2, 3, 4);
        let b = i16x4::new(4, 128, 4, 3);
        let expected = i16x4::new(-1, -1, -124, 1);
        let r = ssse3::_mm_hsub_pi16(a, b);
        assert_eq!(r, expected);
    }

    #[simd_test = "ssse3"]
    unsafe fn _mm_hsub_pi32() {
        let a = i32x2::new(1, 2);
        let b = i32x2::new(4, 128);
        let expected = i32x2::new(-1, -124);
        let r = ssse3::_mm_hsub_pi32(a, b);
        assert_eq!(r, expected);
    }

    #[simd_test = "ssse3"]
    unsafe fn _mm_hsubs_pi16() {
        let a = i16x4::new(1, 2, 3, 4);
        let b = i16x4::new(4, 128, 4, 3);
        let expected = i16x4::new(-1, -1, -124, 1);
        let r = ssse3::_mm_hsubs_pi16(a, b);
        assert_eq!(r, expected);

        // Differences outside the i16 range must clamp rather than wrap:
        // 32767 - (-1) saturates to 32767 and -32768 - 1 saturates to -32768.
        let a = i16x4::new(1, 2, 3, 4);
        let b = i16x4::new(32767, -1, -32768, 1);
        let expected = i16x4::new(-1, -1, 32767, -32768);
        let r = ssse3::_mm_hsubs_pi16(a, b);
        assert_eq!(r, expected);
    }

    #[simd_test = "ssse3"]
    unsafe fn _mm_maddubs_pi16() {
        let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
        let b = i8x8::new(4, 63, 4, 3, 24, 12, 6, 19);
        // 1*4 + 2*63, 3*4 + 4*3, 5*24 + 6*12, 7*6 + 8*19
        let expected = i16x4::new(130, 24, 192, 194);
        let r = ssse3::_mm_maddubs_pi16(a, b);
        assert_eq!(r, expected);
    }

    #[simd_test = "ssse3"]
    unsafe fn _mm_mulhrs_pi16() {
        let a = i16x4::new(1, 2, 3, 4);
        let b = i16x4::new(4, 32767, -1, -32768);
        let expected = i16x4::new(0, 2, 0, -4);
        let r = ssse3::_mm_mulhrs_pi16(a, b);
        assert_eq!(r, expected);
    }

    #[simd_test = "ssse3"]
    unsafe fn _mm_sign_pi8() {
        let a = i8x8::new(1, 2, 3, 4, -5, -6, 7, 8);
        let b = i8x8::new(4, 64, 0, 3, 1, -1, -2, 1);
        let expected = i8x8::new(1, 2, 0, 4, -5, 6, -7, 8);
        let r = ssse3::_mm_sign_pi8(a, b);
        assert_eq!(r, expected);
    }

    #[simd_test = "ssse3"]
    unsafe fn _mm_sign_pi16() {
        let a = i16x4::new(-1, 2, 3, 4);
        let b = i16x4::new(1, -1, 1, 0);
        let expected = i16x4::new(-1, -2, 3, 0);
        let r = ssse3::_mm_sign_pi16(a, b);
        assert_eq!(r, expected);
    }

    #[simd_test = "ssse3"]
    unsafe fn _mm_sign_pi32() {
        let a = i32x2::new(-1, 2);
        let b = i32x2::new(1, 0);
        let expected = i32x2::new(-1, 0);
        let r = ssse3::_mm_sign_pi32(a, b);
        assert_eq!(r, expected);
    }
}

0 commit comments

Comments
 (0)