add mmx module, mmx run-time detection, intrinsics (#220)

gnzlbg · alexcrichton · commit 02ecd726dd3e · 2017-11-28T07:45:41.000-08:00
* [sse] _mm_cvtps_pi32, _mm_cvt_ps2pi

* [mmx] run-time detection support

* [x86] add mmx module

* [x86] make __m64 public

* [sse] add _mm_cvtps_pi{8,16}, _mm_cvttps_pi32, _mm_cvtt_ps2pi

* move new intrinsics from i586 to i686 module

* mmx requires i686
diff --git a/coresimd/src/runtime/x86.rs b/coresimd/src/runtime/x86.rs
@@ -29,6 +29,9 @@ use super::bit;
 #[macro_export]
 #[doc(hidden)]
 macro_rules! __unstable_detect_feature {
+    ("mmx") => {
+        $crate::vendor::__unstable_detect_feature(
+            $crate::vendor::__Feature::mmx{})  };
     ("sse") => {
         $crate::vendor::__unstable_detect_feature(
             $crate::vendor::__Feature::sse{})  };
@@ -165,6 +168,8 @@ macro_rules! __unstable_detect_feature {
 #[allow(non_camel_case_types)]
 #[repr(u8)]
 pub enum __Feature {
+    /// MMX
+    mmx,
     /// SSE (Streaming SIMD Extensions)
     sse,
     /// SSE2 (Streaming SIMD Extensions 2)
@@ -332,6 +337,7 @@ pub fn detect_features() -> usize {
         enable(proc_info_ecx, 20, __Feature::sse4_2);
         enable(proc_info_ecx, 23, __Feature::popcnt);
         enable(proc_info_edx, 24, __Feature::fxsr);
+        enable(proc_info_edx, 23, __Feature::mmx);
         enable(proc_info_edx, 25, __Feature::sse);
         enable(proc_info_edx, 26, __Feature::sse2);
 
diff --git a/coresimd/src/x86/i586/sse.rs b/coresimd/src/x86/i586/sse.rs
@@ -626,10 +626,6 @@ pub unsafe fn _mm_cvt_ss2si(a: f32x4) -> i32 {
     _mm_cvtss_si32(a)
 }
 
-// Blocked by https://github.com/rust-lang-nursery/stdsimd/issues/74
-// pub unsafe fn _mm_cvtps_pi32(a: f32x4) -> i32x2
-// pub unsafe fn _mm_cvt_ps2pi(a: f32x4) -> i32x2 { _mm_cvtps_pi32(a) }
-
 /// Convert the lowest 32 bit float in the input vector to a 32 bit integer
 /// with
 /// truncation.
@@ -655,10 +651,6 @@ pub unsafe fn _mm_cvtt_ss2si(a: f32x4) -> i32 {
     _mm_cvttss_si32(a)
 }
 
-// Blocked by https://github.com/rust-lang-nursery/stdsimd/issues/74
-// pub unsafe fn _mm_cvttps_pi32(a: f32x4) -> i32x2;
-// pub unsafe fn _mm_cvtt_ps2pi(a: f32x4) -> i32x2 { _mm_cvttps_pi32(a) }
-
 /// Extract the lowest 32 bit float from the input vector.
 #[inline(always)]
 #[target_feature = "+sse"]
diff --git a/coresimd/src/x86/i686/mmx.rs b/coresimd/src/x86/i686/mmx.rs
@@ -0,0 +1,88 @@
+//! `i686` MMX instruction set.
+//!
+//! The intrinsics here roughly correspond to those in the `mmintrin.h` C
+//! header.
+//!
+//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
+//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+
+use v64::{i16x4, i32x2, i8x8};
+use x86::__m64;
+use core::mem;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Constructs a 64-bit integer vector with all bits set to zero.
+#[inline(always)]
+#[target_feature = "+mmx,+sse"]
+// FIXME: on x86 this produces a `movl` instead of `xorps` (not asserted)
+// FIXME: on x86_64 this produces a `xor` instruction instead of `xorps`
+#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(xor))]
+pub unsafe fn _mm_setzero_si64() -> __m64 {
+    mem::transmute(0_i64) // reinterpret the zero `i64` bit pattern as `__m64`
+}
+
+/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
+/// using signed saturation; `a` fills the low four lanes, `b` the high four.
+///
+/// Positive values greater than 0x7F are saturated to 0x7F. Negative values
+/// less than 0x80 (-128) are saturated to 0x80.
+#[inline(always)]
+#[target_feature = "+mmx,+sse"]
+#[cfg_attr(test, assert_instr(packsswb))]
+pub unsafe fn _mm_packs_pi16(a: i16x4, b: i16x4) -> i8x8 {
+    mem::transmute(packsswb(mem::transmute(a), mem::transmute(b)))
+}
+
+/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers
+/// using signed saturation; `a` fills the low two lanes, `b` the high two.
+///
+/// Positive values greater than 0x7FFF are saturated to 0x7FFF. Negative
+/// values less than 0x8000 (-32768) are saturated to 0x8000.
+#[inline(always)]
+#[target_feature = "+mmx,+sse"]
+#[cfg_attr(test, assert_instr(packssdw))]
+pub unsafe fn _mm_packs_pi32(a: i32x2, b: i32x2) -> i16x4 {
+    mem::transmute(packssdw(mem::transmute(a), mem::transmute(b)))
+}
+
+#[allow(improper_ctypes)]
+extern "C" { // LLVM MMX pack intrinsics, bound via `link_name`
+    #[link_name = "llvm.x86.mmx.packsswb"]
+    fn packsswb(a: __m64, b: __m64) -> __m64; // backs `_mm_packs_pi16`
+    #[link_name = "llvm.x86.mmx.packssdw"]
+    fn packssdw(a: __m64, b: __m64) -> __m64; // backs `_mm_packs_pi32`
+}
+
+#[cfg(test)]
+mod tests {
+    use v64::{i16x4, i32x2, i8x8};
+    use x86::i686::mmx;
+    use x86::__m64;
+    use stdsimd_test::simd_test;
+
+    #[simd_test = "sse"] // FIXME: should be mmx
+    unsafe fn _mm_setzero_si64() {
+        let r: __m64 = ::std::mem::transmute(0_i64); // expected: all bits zero
+        assert_eq!(r, mmx::_mm_setzero_si64());
+    }
+
+    #[simd_test = "sse"] // FIXME: should be mmx
+    unsafe fn _mm_packs_pi16() {
+        let a = i16x4::new(-1, 2, -3, 4); // every input fits in an `i8`
+        let b = i16x4::new(-5, 6, -7, 8);
+        let r = i8x8::new(-1, 2, -3, 4, -5, 6, -7, 8); // `a` lanes, then `b`
+        assert_eq!(r, mmx::_mm_packs_pi16(a, b));
+    }
+
+    #[simd_test = "sse"] // FIXME: should be mmx
+    unsafe fn _mm_packs_pi32() {
+        let a = i32x2::new(-1, 2); // every input fits in an `i16`
+        let b = i32x2::new(-5, 6);
+        let r = i16x4::new(-1, 2, -5, 6); // `a` lanes, then `b`
+        assert_eq!(r, mmx::_mm_packs_pi32(a, b));
+    }
+}
diff --git a/coresimd/src/x86/i686/mod.rs b/coresimd/src/x86/i686/mod.rs
@@ -1,5 +1,8 @@
 //! `i686` intrinsics
 
+mod mmx;
+pub use self::mmx::*;
+
 mod sse;
 pub use self::sse::*;
 
diff --git a/coresimd/src/x86/i686/sse.rs b/coresimd/src/x86/i686/sse.rs
@@ -1,17 +1,15 @@
 //! `i686` Streaming SIMD Extensions (SSE)
 
-use v64::{i16x4, u8x8};
+use v128::f32x4;
+use v64::{i16x4, i32x2, i8x8, u8x8};
+use x86::__m64;
 use core::mem;
+use x86::i586;
+use x86::i686::mmx;
 
 #[cfg(test)]
 use stdsimd_test::assert_instr;
 
-/// This type is only required for mapping vector types to llvm's `x86_mmx`
-/// type.
-#[allow(non_camel_case_types)]
-#[repr(simd)]
-struct __m64(i64);
-
 #[allow(improper_ctypes)]
 extern "C" {
     #[link_name = "llvm.x86.mmx.pmaxs.w"]
@@ -22,6 +20,10 @@ extern "C" {
     fn pminsw(a: __m64, b: __m64) -> __m64;
     #[link_name = "llvm.x86.mmx.pminu.b"]
     fn pminub(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.sse.cvtps2pi"]
+    fn cvtps2pi(a: f32x4) -> __m64;
+    #[link_name = "llvm.x86.sse.cvttps2pi"]
+    fn cvttps2pi(a: f32x4) -> __m64;
 }
 
 /// Compares the packed 16-bit signed integers of `a` and `b` writing the
@@ -96,9 +98,70 @@ pub unsafe fn _m_pminub(a: u8x8, b: u8x8) -> u8x8 {
     _mm_min_pu8(a, b)
 }
 
+/// Convert the two lower packed single-precision (32-bit) floating-point
+/// elements in `a` to packed 32-bit integers with truncation.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(cvttps2pi))]
+pub unsafe fn _mm_cvttps_pi32(a: f32x4) -> i32x2 {
+    mem::transmute(cvttps2pi(a)) // reinterpret the `__m64` result as `i32x2`
+}
+
+/// Alias of `_mm_cvttps_pi32`: convert the two lower single-precision (32-bit)
+/// floating-point elements in `a` to packed 32-bit integers with truncation.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(cvttps2pi))]
+pub unsafe fn _mm_cvtt_ps2pi(a: f32x4) -> i32x2 {
+    _mm_cvttps_pi32(a)
+}
+
+/// Convert the two lower packed single-precision (32-bit) floating-point
+/// elements in `a` to packed 32-bit integers.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(cvtps2pi))]
+pub unsafe fn _mm_cvtps_pi32(a: f32x4) -> i32x2 {
+    mem::transmute(cvtps2pi(a)) // reinterpret the `__m64` result as `i32x2`
+}
+
+/// Alias of `_mm_cvtps_pi32`: convert the two lower single-precision (32-bit)
+/// floating-point elements in `a` to packed 32-bit integers.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(cvtps2pi))]
+pub unsafe fn _mm_cvt_ps2pi(a: f32x4) -> i32x2 {
+    _mm_cvtps_pi32(a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to
+/// packed 16-bit integers, narrowing with signed saturation.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(cvtps2pi))]
+pub unsafe fn _mm_cvtps_pi16(a: f32x4) -> i16x4 {
+    let b = _mm_cvtps_pi32(a); // lanes 0 and 1 as 32-bit integers
+    let a = i586::_mm_movehl_ps(a, a); // bring lanes 2 and 3 into the low half
+    let c = _mm_cvtps_pi32(a); // lanes 2 and 3 as 32-bit integers
+    mmx::_mm_packs_pi32(b, c) // narrow all four lanes to 16 bits
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to
+/// packed 8-bit integers, and return them in the lower 4 elements of the
+/// result; the upper 4 elements are zero.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(cvtps2pi))]
+pub unsafe fn _mm_cvtps_pi8(a: f32x4) -> i8x8 {
+    let b = _mm_cvtps_pi16(a); // four 16-bit integers
+    let c = mmx::_mm_setzero_si64(); // zeros for the upper four result lanes
+    mmx::_mm_packs_pi16(b, mem::transmute(c)) // narrow to 8 bits, `b` first
+}
+
 #[cfg(test)]
 mod tests {
-    use v64::{i16x4, u8x8};
+    use v128::f32x4;
+    use v64::{i16x4, i32x2, i8x8, u8x8};
     use x86::i686::sse;
     use stdsimd_test::simd_test;
 
@@ -141,4 +204,36 @@ mod tests {
         assert_eq!(r, sse::_mm_min_pu8(a, b));
         assert_eq!(r, sse::_m_pminub(a, b));
     }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_cvtps_pi32() {
+        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
+        let r = i32x2::new(1, 2); // only the two lower lanes are converted
+
+        assert_eq!(r, sse::_mm_cvtps_pi32(a));
+        assert_eq!(r, sse::_mm_cvt_ps2pi(a)); // the alias must agree
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_cvttps_pi32() {
+        let a = f32x4::new(7.0, 2.0, 3.0, 4.0); // NOTE: whole numbers only
+        let r = i32x2::new(7, 2); // only the two lower lanes are converted
+
+        assert_eq!(r, sse::_mm_cvttps_pi32(a));
+        assert_eq!(r, sse::_mm_cvtt_ps2pi(a)); // the alias must agree
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_cvtps_pi16() {
+        let a = f32x4::new(7.0, 2.0, 3.0, 4.0);
+        let r = i16x4::new(7, 2, 3, 4); // all four lanes, in input order
+        assert_eq!(r, sse::_mm_cvtps_pi16(a));
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_cvtps_pi8() {
+        let a = f32x4::new(7.0, 2.0, 3.0, 4.0);
+        let r = i8x8::new(7, 2, 3, 4, 0, 0, 0, 0); // upper four lanes are zero
+        assert_eq!(r, sse::_mm_cvtps_pi8(a));
+    }
 }
diff --git a/coresimd/src/x86/mod.rs b/coresimd/src/x86/mod.rs
@@ -26,6 +26,11 @@ mod x86_64;
 #[cfg(target_arch = "x86_64")]
 pub use self::x86_64::*;
 
+/// 64-bit wide integer vector type; corresponds to LLVM's `x86_mmx` type.
+#[allow(non_camel_case_types)]
+#[repr(simd)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub struct __m64(i64); // single `i64` field, marked `repr(simd)`
 /// 128-bit wide signed integer vector type
 #[allow(non_camel_case_types)]
 pub type __m128i = ::v128::i8x16;