@@ -268,6 +268,259 @@ pub unsafe fn _mm_movemask_ps(a: f32x4) -> i32 {
268
268
movmskps ( a)
269
269
}
270
270
271
+ /// Perform a serializing operation on all store-to-memory instructions that
272
+ /// were issued prior to this instruction.
273
+ ///
274
+ /// Guarantees that every store instruction that precedes, in program order, is
275
+ /// globally visible before any store instruction which follows the fence in
276
+ /// program order.
277
+ #[ inline( always) ]
278
+ #[ target_feature = "+sse" ]
279
+ #[ cfg_attr( test, assert_instr( sfence) ) ]
280
+ pub unsafe fn _mm_sfence ( ) {
281
+ sfence ( )
282
+ }
283
+
284
+ /// Get the unsigned 32-bit value of the MXCSR control and status register.
285
+ ///
286
+ /// For more info see [`_mm_setcsr`](fn._mm_setcsr.html)
287
+ #[ inline( always) ]
288
+ #[ target_feature = "+sse" ]
289
+ #[ cfg_attr( test, assert_instr( stmxcsr) ) ]
290
+ pub unsafe fn _mm_getcsr ( ) -> u32 {
291
+ let mut result = 0i32 ;
292
+ stmxcsr ( ( & mut result) as * mut _ as * mut i8 ) ;
293
+ result as u32
294
+ }
295
+
296
+ /// Set the MXCSR register with the 32-bit unsigned integer value.
297
+ ///
298
+ /// This register constrols how SIMD instructions handle floating point
299
+ /// operations. Modifying this register only affects the current thread.
300
+ ///
301
+ /// It contains several groups of flags:
302
+ ///
303
+ /// * *Exception flags* report which exceptions occurred since last they were
304
+ /// reset.
305
+ ///
306
+ /// * *Masking flags* can be used to mask (ignore) certain exceptions. By default
307
+ /// these flags are all set to 1, so all exceptions are masked. When an
308
+ /// an exception is masked, the processor simply sets the exception flag and
309
+ /// continues the operation. If the exception is unmasked, the flag is also set
310
+ /// but additionally an exception handler is invoked.
311
+ ///
312
+ /// * *Rounding mode flags* control the rounding mode of floating point
313
+ /// instructions.
314
+ ///
315
+ /// * The *denormals-are-zero mode flag* turns all numbers which would be
316
+ /// denormalized (exponent bits are all zeros) into zeros.
317
+ ///
318
+ /// ## Exception Flags
319
+ ///
320
+ /// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing
321
+ /// Infinity by Infinity).
322
+ ///
323
+ /// * `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized
324
+ /// number. Mainly this can cause loss of precision.
325
+ ///
326
+ /// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occured.
327
+ ///
328
+ /// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occured, i.e., a
329
+ /// result was too large to be represented (e.g., an `f32` with absolute value
330
+ /// greater than `2^128`).
331
+ ///
332
+ /// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occured, i.e., a
333
+ /// result was too small to be represented in a normalized way (e.g., an `f32`
334
+ /// with absulte value smaller than `2^-126`.)
335
+ ///
336
+ /// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occured (a.k.a.
337
+ /// precision exception). This means some precision was lost due to rounding.
338
+ /// For example, the fraction `1/3` cannot be represented accurately in a
339
+ /// 32 or 64 bit float and computing it would cause this exception to be
340
+ /// raised. Precision exceptions are very common, so they are usually masked.
341
+ ///
342
+ /// Exception flags can be read and set using the convenience functions
343
+ /// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to
344
+ /// check if an operation caused some overflow:
345
+ ///
346
+ /// ```rust,ignore
347
+ /// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags
348
+ /// // perform calculations
349
+ /// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 {
350
+ /// // handle overflow
351
+ /// }
352
+ /// ```
353
+ ///
354
+ /// ## Masking Flags
355
+ ///
356
+ /// There is one masking flag for each exception flag: `_MM_MASK_INVALID`,
357
+ /// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`,
358
+ /// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
359
+ ///
360
+ /// A single masking bit can be set via
361
+ ///
362
+ /// ```rust,ignore
363
+ /// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW);
364
+ /// ```
365
+ ///
366
+ /// However, since mask bits are by default all set to 1, it is more common to
367
+ /// want to *disable* certain bits. For example, to unmask the underflow
368
+ /// exception, use:
369
+ ///
370
+ /// ```rust,ignore
371
+ /// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow exception
372
+ /// ```
373
+ ///
374
+ /// Warning: an unmasked exception will cause an exception handler to be called.
375
+ /// The standard handler will simply terminate the process. So, in this case
376
+ /// any underflow exception would terminate the current process with something
377
+ /// like `signal: 8, SIGFPE: erroneous arithmetic operation`.
378
+ ///
379
+ /// ## Rounding Mode
380
+ ///
381
+ /// The rounding mode is describe using two bits. It can be read and set using
382
+ /// the convenience wrappers `_MM_GET_ROUNDING_MODE()` and
383
+ /// `_MM_SET_ROUNDING_MODE(mode)`.
384
+ ///
385
+ /// The rounding modes are:
386
+ ///
387
+ /// * `_MM_ROUND_NEAREST`: (default) Round to closest to the infinite precision
388
+ /// value. If two values are equally close, round to even (i.e., least
389
+ /// significant bit will be zero).
390
+ ///
391
+ /// * `_MM_ROUND_DOWN`: Round toward negative Infinity.
392
+ ///
393
+ /// * `_MM_ROUND_UP`: Round toward positive Infinity.
394
+ ///
395
+ /// * `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate).
396
+ ///
397
+ /// Example:
398
+ ///
399
+ /// ```rust,ignore
400
+ /// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN)
401
+ /// ```
402
+ ///
403
+ /// ## Denormals-are-zero/Flush-to-zero Mode
404
+ ///
405
+ /// If this bit is set, values that would be denormalized will be set to zero
406
+ /// instead. This is turned off by default.
407
+ ///
408
+ /// You can read and enable/disable this mode via the helper functions
409
+ /// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`:
410
+ ///
411
+ /// ```rust,ignore
412
+ /// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default)
413
+ /// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on
414
+ /// ```
415
+ ///
416
+ #[ inline( always) ]
417
+ #[ target_feature = "+sse" ]
418
+ #[ cfg_attr( test, assert_instr( ldmxcsr) ) ]
419
+ pub unsafe fn _mm_setcsr ( val : u32 ) {
420
+ ldmxcsr ( & val as * const _ as * const i8 ) ;
421
+ }
422
+
423
/// Exception flag: invalid operation.
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_EXCEPT_INVALID: u32 = 1 << 0;
/// Exception flag: operation on a denormalized number.
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_EXCEPT_DENORM: u32 = 1 << 1;
/// Exception flag: division by zero.
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_EXCEPT_DIV_ZERO: u32 = 1 << 2;
/// Exception flag: numeric overflow.
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_EXCEPT_OVERFLOW: u32 = 1 << 3;
/// Exception flag: numeric underflow.
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_EXCEPT_UNDERFLOW: u32 = 1 << 4;
/// Exception flag: inexact result (precision exception).
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_EXCEPT_INEXACT: u32 = 1 << 5;
/// Bitmask covering all six exception flags (`0x003f`).
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_EXCEPT_MASK: u32 = _MM_EXCEPT_INVALID
    | _MM_EXCEPT_DENORM
    | _MM_EXCEPT_DIV_ZERO
    | _MM_EXCEPT_OVERFLOW
    | _MM_EXCEPT_UNDERFLOW
    | _MM_EXCEPT_INEXACT;
436
+
437
/// Masking flag for the invalid-operation exception.
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_MASK_INVALID: u32 = 1 << 7;
/// Masking flag for the denormal-operand exception.
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_MASK_DENORM: u32 = 1 << 8;
/// Masking flag for the division-by-zero exception.
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_MASK_DIV_ZERO: u32 = 1 << 9;
/// Masking flag for the overflow exception.
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_MASK_OVERFLOW: u32 = 1 << 10;
/// Masking flag for the underflow exception.
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_MASK_UNDERFLOW: u32 = 1 << 11;
/// Masking flag for the inexact-result exception.
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_MASK_INEXACT: u32 = 1 << 12;
/// Bitmask covering all six masking flags (`0x1f80`).
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_MASK_MASK: u32 = _MM_MASK_INVALID
    | _MM_MASK_DENORM
    | _MM_MASK_DIV_ZERO
    | _MM_MASK_OVERFLOW
    | _MM_MASK_UNDERFLOW
    | _MM_MASK_INEXACT;
450
+
451
/// Rounding mode: round to nearest (no rounding bit set).
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_ROUND_NEAREST: u32 = 0;
/// Rounding mode: round toward negative Infinity.
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_ROUND_DOWN: u32 = 1 << 13;
/// Rounding mode: round toward positive Infinity.
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_ROUND_UP: u32 = 1 << 14;
/// Rounding mode: round toward zero (both rounding bits set).
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_ROUND_TOWARD_ZERO: u32 = _MM_ROUND_DOWN | _MM_ROUND_UP;
/// Bitmask covering the two rounding-mode bits (`0x6000`).
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_ROUND_MASK: u32 = _MM_ROUND_DOWN | _MM_ROUND_UP;
460
+
461
/// Bitmask selecting the flush-to-zero bit (`0x8000`).
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_FLUSH_ZERO_MASK: u32 = 1 << 15;
/// Flush-to-zero mode enabled (the flush-to-zero bit is set).
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_FLUSH_ZERO_ON: u32 = _MM_FLUSH_ZERO_MASK;
/// Flush-to-zero mode disabled (no bit set).
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_FLUSH_ZERO_OFF: u32 = 0;
466
+
467
+ #[ inline( always) ]
468
+ #[ allow( non_snake_case) ]
469
+ #[ target_feature = "+sse" ]
470
+ pub unsafe fn _MM_GET_EXCEPTION_MASK ( ) -> u32 {
471
+ _mm_getcsr ( ) & _MM_MASK_MASK
472
+ }
473
+
474
+ #[ inline( always) ]
475
+ #[ allow( non_snake_case) ]
476
+ #[ target_feature = "+sse" ]
477
+ pub unsafe fn _MM_GET_EXCEPTION_STATE ( ) -> u32 {
478
+ _mm_getcsr ( ) & _MM_EXCEPT_MASK
479
+ }
480
+
481
+ #[ inline( always) ]
482
+ #[ allow( non_snake_case) ]
483
+ #[ target_feature = "+sse" ]
484
+ pub unsafe fn _MM_GET_FLUSH_ZERO_MODE ( ) -> u32 {
485
+ _mm_getcsr ( ) & _MM_FLUSH_ZERO_MASK
486
+ }
487
+
488
+ #[ inline( always) ]
489
+ #[ allow( non_snake_case) ]
490
+ #[ target_feature = "+sse" ]
491
+ pub unsafe fn _MM_GET_ROUNDING_MODE ( ) -> u32 {
492
+ _mm_getcsr ( ) & _MM_ROUND_MASK
493
+ }
494
+
495
+ #[ inline( always) ]
496
+ #[ allow( non_snake_case) ]
497
+ #[ target_feature = "+sse" ]
498
+ pub unsafe fn _MM_SET_EXCEPTION_MASK ( x : u32 ) {
499
+ _mm_setcsr ( ( _mm_getcsr ( ) & !_MM_MASK_MASK) | x)
500
+ }
501
+
502
+ #[ inline( always) ]
503
+ #[ allow( non_snake_case) ]
504
+ #[ target_feature = "+sse" ]
505
+ pub unsafe fn _MM_SET_EXCEPTION_STATE ( x : u32 ) {
506
+ _mm_setcsr ( ( _mm_getcsr ( ) & !_MM_EXCEPT_MASK) | x)
507
+ }
508
+
509
+ #[ inline( always) ]
510
+ #[ allow( non_snake_case) ]
511
+ #[ target_feature = "+sse" ]
512
+ pub unsafe fn _MM_SET_FLUSH_ZERO_MODE ( x : u32 ) {
513
+ let val = ( _mm_getcsr ( ) & !_MM_FLUSH_ZERO_MASK) | x;
514
+ //println!("setting csr={:x}", val);
515
+ _mm_setcsr ( val)
516
+ }
517
+
518
+ #[ inline( always) ]
519
+ #[ allow( non_snake_case) ]
520
+ #[ target_feature = "+sse" ]
521
+ pub unsafe fn _MM_SET_ROUNDING_MODE ( x : u32 ) {
522
+ _mm_setcsr ( ( _mm_getcsr ( ) & !_MM_ROUND_MASK) | x)
523
+ }
271
524
272
525
/// See [`_mm_prefetch`](fn._mm_prefetch.html).
273
526
pub const _MM_HINT_T0: i8 = 3 ;
@@ -374,6 +627,12 @@ extern {
374
627
fn maxps ( a : f32x4 , b : f32x4 ) -> f32x4 ;
375
628
#[ link_name = "llvm.x86.sse.movmsk.ps" ]
376
629
fn movmskps ( a : f32x4 ) -> i32 ;
630
+ #[ link_name = "llvm.x86.sse.sfence" ]
631
+ fn sfence ( ) ;
632
+ #[ link_name = "llvm.x86.sse.stmxcsr" ]
633
+ fn stmxcsr ( p : * mut i8 ) ;
634
+ #[ link_name = "llvm.x86.sse.ldmxcsr" ]
635
+ fn ldmxcsr ( p : * const i8 ) ;
377
636
#[ link_name = "llvm.prefetch" ]
378
637
fn prefetch ( p : * const c_void , rw : i32 , loc : i32 , ty : i32 ) ;
379
638
}
@@ -383,6 +642,7 @@ mod tests {
383
642
use v128:: * ;
384
643
use x86:: sse;
385
644
use stdsimd_test:: simd_test;
645
+ use test:: black_box; // Used to inhibit constant-folding.
386
646
387
647
#[ simd_test = "sse" ]
388
648
unsafe fn _mm_add_ps ( ) {
@@ -577,4 +837,62 @@ mod tests {
577
837
let r = sse:: _mm_movemask_ps ( f32x4:: new ( -1.0 , -5.0 , -5.0 , 0.0 ) ) ;
578
838
assert_eq ! ( r, 0b0111 ) ;
579
839
}
840
+
841
+ #[ simd_test = "sse" ]
842
+ unsafe fn _mm_sfence ( ) {
843
+ sse:: _mm_sfence ( ) ;
844
+ }
845
+
846
+ #[ simd_test = "sse" ]
847
+ unsafe fn _mm_getcsr_setcsr_1 ( ) {
848
+ let saved_csr = sse:: _mm_getcsr ( ) ;
849
+
850
+ let a = f32x4:: new ( 1.1e-36 , 0.0 , 0.0 , 1.0 ) ;
851
+ let b = f32x4:: new ( 0.001 , 0.0 , 0.0 , 1.0 ) ;
852
+
853
+ sse:: _MM_SET_FLUSH_ZERO_MODE ( sse:: _MM_FLUSH_ZERO_ON) ;
854
+ let r = sse:: _mm_mul_ps ( black_box ( a) , black_box ( b) ) ;
855
+
856
+ sse:: _mm_setcsr ( saved_csr) ;
857
+
858
+ let exp = f32x4:: new ( 0.0 , 0.0 , 0.0 , 1.0 ) ;
859
+ assert_eq ! ( r, exp) ; // first component is a denormalized f32
860
+ }
861
+
862
+ #[ simd_test = "sse" ]
863
+ unsafe fn _mm_getcsr_setcsr_2 ( ) {
864
+ // Same as _mm_setcsr_1 test, but with opposite flag value.
865
+
866
+ let saved_csr = sse:: _mm_getcsr ( ) ;
867
+
868
+ let a = f32x4:: new ( 1.1e-36 , 0.0 , 0.0 , 1.0 ) ;
869
+ let b = f32x4:: new ( 0.001 , 0.0 , 0.0 , 1.0 ) ;
870
+
871
+ sse:: _MM_SET_FLUSH_ZERO_MODE ( sse:: _MM_FLUSH_ZERO_OFF) ;
872
+ let r = sse:: _mm_mul_ps ( black_box ( a) , black_box ( b) ) ;
873
+
874
+ sse:: _mm_setcsr ( saved_csr) ;
875
+
876
+ let exp = f32x4:: new ( 1.1e-39 , 0.0 , 0.0 , 1.0 ) ;
877
+ assert_eq ! ( r, exp) ; // first component is a denormalized f32
878
+ }
879
+
880
+ #[ simd_test = "sse" ]
881
+ unsafe fn _mm_getcsr_setcsr_underflow ( ) {
882
+ sse:: _MM_SET_EXCEPTION_STATE ( 0 ) ;
883
+
884
+ let a = f32x4:: new ( 1.1e-36 , 0.0 , 0.0 , 1.0 ) ;
885
+ let b = f32x4:: new ( 1e-5 , 0.0 , 0.0 , 1.0 ) ;
886
+
887
+ assert_eq ! ( sse:: _MM_GET_EXCEPTION_STATE( ) , 0 ) ; // just to be sure
888
+
889
+ let r = sse:: _mm_mul_ps ( black_box ( a) , black_box ( b) ) ;
890
+
891
+ let exp = f32x4:: new ( 1.1e-41 , 0.0 , 0.0 , 1.0 ) ;
892
+ assert_eq ! ( r, exp) ;
893
+
894
+ let underflow =
895
+ sse:: _MM_GET_EXCEPTION_STATE ( ) & sse:: _MM_EXCEPT_UNDERFLOW != 0 ;
896
+ assert_eq ! ( underflow, true ) ;
897
+ }
580
898
}
0 commit comments