Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions coresimd/src/runtime/x86.rs
Original file line number Diff line number Diff line change
Expand Up @@ -302,8 +302,7 @@ pub fn detect_features() -> usize {
// Contains information about bmi,bmi2, and avx2 support.
let (extended_features_ebx, extended_features_ecx) = if max_basic_leaf >= 7
{
let CpuidResult { ebx, ecx, .. } =
unsafe { __cpuid(0x0000_0007_u32) };
let CpuidResult { ebx, ecx, .. } = unsafe { __cpuid(0x0000_0007_u32) };
(ebx, ecx)
} else {
(0, 0) // CPUID does not support "Extended Features"
Expand All @@ -320,8 +319,7 @@ pub fn detect_features() -> usize {
// EAX = 0x8000_0001, ECX=0: Queries "Extended Processor Info and Feature
// Bits"
let extended_proc_info_ecx = if extended_max_basic_leaf >= 1 {
let CpuidResult { ecx, .. } =
unsafe { __cpuid(0x8000_0001_u32) };
let CpuidResult { ecx, .. } = unsafe { __cpuid(0x8000_0001_u32) };
ecx
} else {
0
Expand Down Expand Up @@ -457,6 +455,7 @@ mod tests {
println!("ssse3: {:?}", cfg_feature_enabled!("ssse3"));
println!("sse4.1: {:?}", cfg_feature_enabled!("sse4.1"));
println!("sse4.2: {:?}", cfg_feature_enabled!("sse4.2"));
println!("sse4a: {:?}", cfg_feature_enabled!("sse4a"));
println!("avx: {:?}", cfg_feature_enabled!("avx"));
println!("avx2: {:?}", cfg_feature_enabled!("avx2"));
println!("avx512f {:?}", cfg_feature_enabled!("avx512f"));
Expand Down Expand Up @@ -495,6 +494,7 @@ mod tests {
assert_eq!(cfg_feature_enabled!("ssse3"), information.ssse3());
assert_eq!(cfg_feature_enabled!("sse4.1"), information.sse4_1());
assert_eq!(cfg_feature_enabled!("sse4.2"), information.sse4_2());
assert_eq!(cfg_feature_enabled!("sse4a"), information.sse4a());
assert_eq!(cfg_feature_enabled!("avx"), information.avx());
assert_eq!(cfg_feature_enabled!("avx2"), information.avx2());
assert_eq!(cfg_feature_enabled!("avx512f"), information.avx512f());
Expand All @@ -520,7 +520,6 @@ mod tests {
assert_eq!(cfg_feature_enabled!("bmi"), information.bmi1());
assert_eq!(cfg_feature_enabled!("bmi2"), information.bmi2());
assert_eq!(cfg_feature_enabled!("popcnt"), information.popcnt());
assert_eq!(cfg_feature_enabled!("sse4a"), information.sse4a());
assert_eq!(cfg_feature_enabled!("abm"), information.lzcnt());
assert_eq!(cfg_feature_enabled!("tbm"), information.tbm());
assert_eq!(cfg_feature_enabled!("lzcnt"), information.lzcnt());
Expand Down
1 change: 1 addition & 0 deletions coresimd/src/x86/i586/tbm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#[cfg(test)]
use stdsimd_test::assert_instr;

// FIXME(blocked on #248)
// TODO: LLVM-CODEGEN ERROR: LLVM ERROR: Cannot select:
// intrinsic %llvm.x86.tbm.bextri.u32
/*
Expand Down
5 changes: 5 additions & 0 deletions coresimd/src/x86/i686/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,8 @@ pub use self::sse41::*;

mod sse42;
pub use self::sse42::*;

#[cfg(not(feature = "intel_sde"))]
mod sse4a;
#[cfg(not(feature = "intel_sde"))]
pub use self::sse4a::*;
155 changes: 155 additions & 0 deletions coresimd/src/x86/i686/sse4a.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
//! `i686`'s Streaming SIMD Extensions 4a (SSE4a)

use core::mem;
use v128::*;

#[cfg(test)]
use stdsimd_test::assert_instr;

// Raw declarations of the LLVM SSE4a intrinsics used by this module. Each
// `link_name` names the LLVM intrinsic that the declaration resolves to; the
// public `_mm_*` functions below are thin wrappers that forward to them.
//
// NOTE(review): `improper_ctypes` is presumably silenced because the SIMD
// vector types passed here are not considered FFI-safe by the lint — confirm.
#[allow(improper_ctypes)]
extern "C" {
// Bit-field extract; note that the second operand is typed as `i8x16`
// here, while the public wrapper receives it as `i64x2`.
#[link_name = "llvm.x86.sse4a.extrq"]
fn extrq(x: i64x2, y: i8x16) -> i64x2;
// Bit-field insert; both operands and the result are `i64x2`.
#[link_name = "llvm.x86.sse4a.insertq"]
fn insertq(x: i64x2, y: i64x2) -> i64x2;
// Non-temporal scalar `f64` store through the raw pointer `x`.
#[link_name = "llvm.x86.sse4a.movnt.sd"]
fn movntsd(x: *mut f64, y: f64x2);
// Non-temporal scalar `f32` store through the raw pointer `x`.
#[link_name = "llvm.x86.sse4a.movnt.ss"]
fn movntss(x: *mut f32, y: f32x4);
}

// FIXME(blocked on #248): _mm_extracti_si64(x, len, idx) // EXTRQ
// FIXME(blocked on #248): _mm_inserti_si64(x, y, len, idx) // INSERTQ

/// Extracts the bit range specified by `y` from the lower 64 bits of `x`.
///
/// The [13:8] bits of `y` specify the index of the bit-range to extract. The
/// [5:0] bits of `y` specify the length of the bit-range to extract. All other
/// bits are ignored.
///
/// If the length is zero, it is interpreted as `64`. If the length and index
/// are zero, the lower 64 bits of `x` are extracted.
///
/// If `length == 0 && index > 0` or `lenght + index > 64` the result is
/// undefined.
#[inline(always)]
#[target_feature = "+sse4a"]
#[cfg_attr(test, assert_instr(extrq))]
pub unsafe fn _mm_extract_si64(x: i64x2, y: i64x2) -> i64x2 {
extrq(x, mem::transmute(y))
}

/// Inserts the low `length` bits of `y` (bits `[length-1:0]`) into `x`,
/// starting at bit `index` of `x`.
///
/// The control fields are taken from the upper 64 bits of `y`:
///
/// - bits `[69:64]` specify the `length`,
/// - bits `[77:72]` specify the `index`.
///
/// A `length` of zero is interpreted as `64`.
///
/// The result is undefined if `index + length > 64` or if
/// `index > 0 && length == 0`.
#[inline(always)]
#[target_feature = "+sse4a"]
#[cfg_attr(test, assert_instr(insertq))]
pub unsafe fn _mm_insert_si64(x: i64x2, y: i64x2) -> i64x2 {
    // `insertq` is declared as taking `i64x2` for both operands, so `y` is
    // forwarded unchanged; no bit reinterpretation is required.
    insertq(x, y)
}

/// Non-temporal store of the lowest element of `a` (`a.0`) into `p`.
///
/// The underlying `MOVNTSD` instruction stores the low 64 bits of its source
/// register; the upper element of `a` is not written anywhere.
///
/// # Safety
///
/// `p` is written through as a raw pointer, so the caller must ensure that it
/// is valid for a write of one `f64`.
#[inline(always)]
#[target_feature = "+sse4a"]
#[cfg_attr(test, assert_instr(movntsd))]
pub unsafe fn _mm_stream_sd(p: *mut f64, a: f64x2) {
movntsd(p, a);
}

/// Non-temporal store of the lowest element of `a` (`a.0`) into `p`.
///
/// The underlying `MOVNTSS` instruction stores the low 32 bits of its source
/// register; the three upper elements of `a` are not written anywhere.
///
/// # Safety
///
/// `p` is written through as a raw pointer, so the caller must ensure that it
/// is valid for a write of one `f32`.
#[inline(always)]
#[target_feature = "+sse4a"]
#[cfg_attr(test, assert_instr(movntss))]
pub unsafe fn _mm_stream_ss(p: *mut f32, a: f32x4) {
movntss(p, a);
}

#[cfg(test)]
mod tests {
    use stdsimd_test::simd_test;
    use x86::i686::sse4a;
    use v128::*;

    /// Extracts the 4-bit field at bits `[11:8]` of the low element.
    #[simd_test = "sse4a"]
    unsafe fn _mm_extract_si64() {
        // Bits [11:8] of the source hold `0110`: this is the field extracted.
        let b = 0b0110_0000_0000_i64;
        let x = i64x2::new(b, 0);
        // Control word: bits [13:8] = index = 8, bits [5:0] = length = 4.
        let v = 0b001000___00___000100_i64;
        let y = i64x2::new(v, 0);
        let e = i64x2::new(0b0110_i64, 0);
        let r = sse4a::_mm_extract_si64(x, y);
        assert_eq!(r, e);
    }

    /// Replaces the 4-bit field at bits `[11:8]` of the low element.
    #[simd_test = "sse4a"]
    unsafe fn _mm_insert_si64() {
        // Bit pattern to insert.
        let i = 0b0110_i64;
        // Destination value; its bits [11:8] (`1010`) get replaced.
        let z = 0b1010_1010_1010i64;
        // Expected result: bits [11:8] now hold `0110`.
        let e = 0b0110_1010_1010i64;
        let x = i64x2::new(z, 0);
        let expected = i64x2::new(e, 0);
        // Control word, placed in the upper element of `y`:
        // bits [13:8] = index = 8, bits [5:0] = length = 4.
        let v = 0b001000___00___000100_i64;
        let y = i64x2::new(i, v);
        let r = sse4a::_mm_insert_si64(x, y);
        assert_eq!(r, expected);
    }

    /// 16-byte aligned backing storage for the `f64` streaming-store test.
    #[repr(align(16))]
    struct MemoryF64 {
        data: [f64; 2],
    }

    /// Only the lowest element of the vector may be written, and only to
    /// the first slot of the destination.
    #[simd_test = "sse4a"]
    unsafe fn _mm_stream_sd() {
        let mut mem = MemoryF64 {
            data: [1.0_f64, 2.0],
        };
        let x = f64x2::new(3.0, 4.0);

        sse4a::_mm_stream_sd(mem.data.as_mut_ptr(), x);

        // MOVNTSD stores the low 64 bits of the register, i.e. the first
        // element passed to `new`, and leaves the neighbouring slot alone.
        assert_eq!(mem.data[0], 3.0);
        assert_eq!(mem.data[1], 2.0);
    }

    /// 16-byte aligned backing storage for the `f32` streaming-store test.
    #[repr(align(16))]
    struct MemoryF32 {
        data: [f32; 4],
    }

    /// Only the lowest element of the vector may be written, and only to
    /// the first slot of the destination.
    #[simd_test = "sse4a"]
    unsafe fn _mm_stream_ss() {
        let mut mem = MemoryF32 {
            data: [1.0_f32, 2.0, 3.0, 4.0],
        };
        let x = f32x4::new(5.0, 6.0, 7.0, 8.0);

        sse4a::_mm_stream_ss(mem.data.as_mut_ptr(), x);

        // MOVNTSS stores the low 32 bits of the register, i.e. the first
        // element passed to `new`, and leaves the other slots alone.
        assert_eq!(mem.data[0], 5.0);
        assert_eq!(mem.data[1], 2.0);
        assert_eq!(mem.data[2], 3.0);
        assert_eq!(mem.data[3], 4.0);
    }
}