Skip to content

Commit 6d2730a

Browse files
nominoloalexcrichton
authored and committed
Implement _mm_getcsr, _mm_setcsr, _mm_sfence (#88)
* Add _mm_sfence * Add _mm_getcsr/_mm_setcsr and convenience wrappers * Use test::black_box to simplify tests * Use uppercase naming for C-macro equivalents Discussed at #84
1 parent 3a19de8 commit 6d2730a

File tree

2 files changed

+322
-1
lines changed

2 files changed

+322
-1
lines changed

src/lib.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,11 +119,14 @@
119119
const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd, simd_ffi,
120120
target_feature, cfg_target_feature, i128_type, asm, const_atomic_usize_new
121121
)]
122-
#![cfg_attr(test, feature(proc_macro))]
122+
#![cfg_attr(test, feature(proc_macro, test))]
123123

124124
#[cfg(test)]
125125
extern crate stdsimd_test;
126126

127+
#[cfg(test)]
128+
extern crate test;
129+
127130
/// Platform independent SIMD vector types and operations.
128131
pub mod simd {
129132
pub use v128::*;

src/x86/sse.rs

Lines changed: 318 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,259 @@ pub unsafe fn _mm_movemask_ps(a: f32x4) -> i32 {
268268
movmskps(a)
269269
}
270270

271+
/// Perform a serializing operation on all store-to-memory instructions that
272+
/// were issued prior to this instruction.
273+
///
274+
/// Guarantees that every store instruction that precedes, in program order, is
275+
/// globally visible before any store instruction which follows the fence in
276+
/// program order.
277+
#[inline(always)]
278+
#[target_feature = "+sse"]
279+
#[cfg_attr(test, assert_instr(sfence))]
280+
pub unsafe fn _mm_sfence() {
281+
sfence()
282+
}
283+
284+
/// Get the unsigned 32-bit value of the MXCSR control and status register.
285+
///
286+
/// For more info see [`_mm_setcsr`](fn._mm_setcsr.html)
287+
#[inline(always)]
288+
#[target_feature = "+sse"]
289+
#[cfg_attr(test, assert_instr(stmxcsr))]
290+
pub unsafe fn _mm_getcsr() -> u32 {
291+
let mut result = 0i32;
292+
stmxcsr((&mut result) as *mut _ as *mut i8);
293+
result as u32
294+
}
295+
296+
/// Set the MXCSR register with the 32-bit unsigned integer value.
297+
///
298+
/// This register controls how SIMD instructions handle floating point
299+
/// operations. Modifying this register only affects the current thread.
300+
///
301+
/// It contains several groups of flags:
302+
///
303+
/// * *Exception flags* report which exceptions occurred since last they were
304+
/// reset.
305+
///
306+
/// * *Masking flags* can be used to mask (ignore) certain exceptions. By default
307+
/// these flags are all set to 1, so all exceptions are masked. When an
308+
/// exception is masked, the processor simply sets the exception flag and
309+
/// continues the operation. If the exception is unmasked, the flag is also set
310+
/// but additionally an exception handler is invoked.
311+
///
312+
/// * *Rounding mode flags* control the rounding mode of floating point
313+
/// instructions.
314+
///
315+
/// * The *flush-to-zero mode flag* turns all numbers which would be
316+
/// denormalized (exponent bits are all zeros) into zeros.
317+
///
318+
/// ## Exception Flags
319+
///
320+
/// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing
321+
/// Infinity by Infinity).
322+
///
323+
/// * `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized
324+
/// number. Mainly this can cause loss of precision.
325+
///
326+
/// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occurred.
327+
///
328+
/// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occurred, i.e., a
329+
/// result was too large to be represented (e.g., an `f32` with absolute value
330+
/// greater than `2^128`).
331+
///
332+
/// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occurred, i.e., a
333+
/// result was too small to be represented in a normalized way (e.g., an `f32`
334+
/// with absolute value smaller than `2^-126`).
335+
///
336+
/// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occurred (a.k.a.
337+
/// precision exception). This means some precision was lost due to rounding.
338+
/// For example, the fraction `1/3` cannot be represented accurately in a
339+
/// 32 or 64 bit float and computing it would cause this exception to be
340+
/// raised. Precision exceptions are very common, so they are usually masked.
341+
///
342+
/// Exception flags can be read and set using the convenience functions
343+
/// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to
344+
/// check if an operation caused some overflow:
345+
///
346+
/// ```rust,ignore
347+
/// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags
348+
/// // perform calculations
349+
/// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 {
350+
/// // handle overflow
351+
/// }
352+
/// ```
353+
///
354+
/// ## Masking Flags
355+
///
356+
/// There is one masking flag for each exception flag: `_MM_MASK_INVALID`,
357+
/// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`,
358+
/// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
359+
///
360+
/// A single masking bit can be set via
361+
///
362+
/// ```rust,ignore
363+
/// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW);
364+
/// ```
365+
///
366+
/// However, since mask bits are by default all set to 1, it is more common to
367+
/// want to *disable* certain bits. For example, to unmask the underflow
368+
/// exception, use:
369+
///
370+
/// ```rust,ignore
371+
/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow exception
372+
/// ```
373+
///
374+
/// Warning: an unmasked exception will cause an exception handler to be called.
375+
/// The standard handler will simply terminate the process. So, in this case
376+
/// any underflow exception would terminate the current process with something
377+
/// like `signal: 8, SIGFPE: erroneous arithmetic operation`.
378+
///
379+
/// ## Rounding Mode
380+
///
381+
/// The rounding mode is described using two bits. It can be read and set using
382+
/// the convenience wrappers `_MM_GET_ROUNDING_MODE()` and
383+
/// `_MM_SET_ROUNDING_MODE(mode)`.
384+
///
385+
/// The rounding modes are:
386+
///
387+
/// * `_MM_ROUND_NEAREST`: (default) Round to closest to the infinite precision
388+
/// value. If two values are equally close, round to even (i.e., least
389+
/// significant bit will be zero).
390+
///
391+
/// * `_MM_ROUND_DOWN`: Round toward negative Infinity.
392+
///
393+
/// * `_MM_ROUND_UP`: Round toward positive Infinity.
394+
///
395+
/// * `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate).
396+
///
397+
/// Example:
398+
///
399+
/// ```rust,ignore
400+
/// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN)
401+
/// ```
402+
///
403+
/// ## Denormals-are-zero/Flush-to-zero Mode
404+
///
405+
/// If this bit is set, values that would be denormalized will be set to zero
406+
/// instead. This is turned off by default.
407+
///
408+
/// You can read and enable/disable this mode via the helper functions
409+
/// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`:
410+
///
411+
/// ```rust,ignore
412+
/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default)
413+
/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on
414+
/// ```
415+
///
416+
#[inline(always)]
417+
#[target_feature = "+sse"]
418+
#[cfg_attr(test, assert_instr(ldmxcsr))]
419+
pub unsafe fn _mm_setcsr(val: u32) {
420+
ldmxcsr(&val as *const _ as *const i8);
421+
}
422+
423+
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
424+
pub const _MM_EXCEPT_INVALID: u32 = 0x0001;
425+
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
426+
pub const _MM_EXCEPT_DENORM: u32 = 0x0002;
427+
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
428+
pub const _MM_EXCEPT_DIV_ZERO: u32 = 0x0004;
429+
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
430+
pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008;
431+
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
432+
pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010;
433+
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
434+
pub const _MM_EXCEPT_INEXACT: u32 = 0x0020;
435+
pub const _MM_EXCEPT_MASK: u32 = 0x003f;
436+
437+
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
438+
pub const _MM_MASK_INVALID: u32 = 0x0080;
439+
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
440+
pub const _MM_MASK_DENORM: u32 = 0x0100;
441+
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
442+
pub const _MM_MASK_DIV_ZERO: u32 = 0x0200;
443+
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
444+
pub const _MM_MASK_OVERFLOW: u32 = 0x0400;
445+
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
446+
pub const _MM_MASK_UNDERFLOW: u32 = 0x0800;
447+
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
448+
pub const _MM_MASK_INEXACT: u32 = 0x1000;
449+
pub const _MM_MASK_MASK: u32 = 0x1f80;
450+
451+
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
452+
pub const _MM_ROUND_NEAREST: u32 = 0x0000;
453+
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
454+
pub const _MM_ROUND_DOWN: u32 = 0x2000;
455+
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
456+
pub const _MM_ROUND_UP: u32 = 0x4000;
457+
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
458+
pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000;
459+
pub const _MM_ROUND_MASK: u32 = 0x6000;
460+
461+
pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000;
462+
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
463+
pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000;
464+
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
465+
pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000;
466+
467+
#[inline(always)]
468+
#[allow(non_snake_case)]
469+
#[target_feature = "+sse"]
470+
pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 {
471+
_mm_getcsr() & _MM_MASK_MASK
472+
}
473+
474+
#[inline(always)]
475+
#[allow(non_snake_case)]
476+
#[target_feature = "+sse"]
477+
pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 {
478+
_mm_getcsr() & _MM_EXCEPT_MASK
479+
}
480+
481+
#[inline(always)]
482+
#[allow(non_snake_case)]
483+
#[target_feature = "+sse"]
484+
pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 {
485+
_mm_getcsr() & _MM_FLUSH_ZERO_MASK
486+
}
487+
488+
#[inline(always)]
489+
#[allow(non_snake_case)]
490+
#[target_feature = "+sse"]
491+
pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 {
492+
_mm_getcsr() & _MM_ROUND_MASK
493+
}
494+
495+
#[inline(always)]
496+
#[allow(non_snake_case)]
497+
#[target_feature = "+sse"]
498+
pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) {
499+
_mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | x)
500+
}
501+
502+
#[inline(always)]
503+
#[allow(non_snake_case)]
504+
#[target_feature = "+sse"]
505+
pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) {
506+
_mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | x)
507+
}
508+
509+
#[inline(always)]
510+
#[allow(non_snake_case)]
511+
#[target_feature = "+sse"]
512+
pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) {
513+
let val = (_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | x;
514+
//println!("setting csr={:x}", val);
515+
_mm_setcsr(val)
516+
}
517+
518+
#[inline(always)]
519+
#[allow(non_snake_case)]
520+
#[target_feature = "+sse"]
521+
pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) {
522+
_mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | x)
523+
}
271524

272525
/// See [`_mm_prefetch`](fn._mm_prefetch.html).
273526
pub const _MM_HINT_T0: i8 = 3;
@@ -374,6 +627,12 @@ extern {
374627
fn maxps(a: f32x4, b: f32x4) -> f32x4;
375628
#[link_name = "llvm.x86.sse.movmsk.ps"]
376629
fn movmskps(a: f32x4) -> i32;
630+
#[link_name = "llvm.x86.sse.sfence"]
631+
fn sfence();
632+
#[link_name = "llvm.x86.sse.stmxcsr"]
633+
fn stmxcsr(p: *mut i8);
634+
#[link_name = "llvm.x86.sse.ldmxcsr"]
635+
fn ldmxcsr(p: *const i8);
377636
#[link_name = "llvm.prefetch"]
378637
fn prefetch(p: *const c_void, rw: i32, loc: i32, ty: i32);
379638
}
@@ -383,6 +642,7 @@ mod tests {
383642
use v128::*;
384643
use x86::sse;
385644
use stdsimd_test::simd_test;
645+
use test::black_box; // Used to inhibit constant-folding.
386646

387647
#[simd_test = "sse"]
388648
unsafe fn _mm_add_ps() {
@@ -577,4 +837,62 @@ mod tests {
577837
let r = sse::_mm_movemask_ps(f32x4::new(-1.0, -5.0, -5.0, 0.0));
578838
assert_eq!(r, 0b0111);
579839
}
840+
841+
#[simd_test = "sse"]
842+
unsafe fn _mm_sfence() {
843+
sse::_mm_sfence();
844+
}
845+
846+
#[simd_test = "sse"]
847+
unsafe fn _mm_getcsr_setcsr_1() {
848+
let saved_csr = sse::_mm_getcsr();
849+
850+
let a = f32x4::new(1.1e-36, 0.0, 0.0, 1.0);
851+
let b = f32x4::new(0.001, 0.0, 0.0, 1.0);
852+
853+
sse::_MM_SET_FLUSH_ZERO_MODE(sse::_MM_FLUSH_ZERO_ON);
854+
let r = sse::_mm_mul_ps(black_box(a), black_box(b));
855+
856+
sse::_mm_setcsr(saved_csr);
857+
858+
let exp = f32x4::new(0.0, 0.0, 0.0, 1.0);
859+
assert_eq!(r, exp); // first component is a denormalized f32
860+
}
861+
862+
#[simd_test = "sse"]
863+
unsafe fn _mm_getcsr_setcsr_2() {
864+
// Same as the _mm_getcsr_setcsr_1 test, but with the opposite flag value.
865+
866+
let saved_csr = sse::_mm_getcsr();
867+
868+
let a = f32x4::new(1.1e-36, 0.0, 0.0, 1.0);
869+
let b = f32x4::new(0.001, 0.0, 0.0, 1.0);
870+
871+
sse::_MM_SET_FLUSH_ZERO_MODE(sse::_MM_FLUSH_ZERO_OFF);
872+
let r = sse::_mm_mul_ps(black_box(a), black_box(b));
873+
874+
sse::_mm_setcsr(saved_csr);
875+
876+
let exp = f32x4::new(1.1e-39, 0.0, 0.0, 1.0);
877+
assert_eq!(r, exp); // first component is a denormalized f32
878+
}
879+
880+
#[simd_test = "sse"]
881+
unsafe fn _mm_getcsr_setcsr_underflow() {
882+
sse::_MM_SET_EXCEPTION_STATE(0);
883+
884+
let a = f32x4::new(1.1e-36, 0.0, 0.0, 1.0);
885+
let b = f32x4::new(1e-5, 0.0, 0.0, 1.0);
886+
887+
assert_eq!(sse::_MM_GET_EXCEPTION_STATE(), 0); // just to be sure
888+
889+
let r = sse::_mm_mul_ps(black_box(a), black_box(b));
890+
891+
let exp = f32x4::new(1.1e-41, 0.0, 0.0, 1.0);
892+
assert_eq!(r, exp);
893+
894+
let underflow =
895+
sse::_MM_GET_EXCEPTION_STATE() & sse::_MM_EXCEPT_UNDERFLOW != 0;
896+
assert_eq!(underflow, true);
897+
}
580898
}

0 commit comments

Comments
 (0)