diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index fba2901601..82fe0acd23 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -1718,7 +1718,12 @@ pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntdq #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) { - intrinsics::nontemporal_store(mem_addr, a); + crate::arch::asm!( + "vmovntps [{mem_addr}], {a}", + mem_addr = in(reg) mem_addr, + a = in(ymm_reg) a, + options(nostack, preserves_flags), + ); } /// Moves double-precision values from a 256-bit vector of `[4 x double]` @@ -1741,7 +1746,12 @@ pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) { #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) { - intrinsics::nontemporal_store(mem_addr as *mut __m256d, a); + crate::arch::asm!( + "vmovntps [{mem_addr}], {a}", + mem_addr = in(reg) mem_addr, + a = in(ymm_reg) a, + options(nostack, preserves_flags), + ); } /// Moves single-precision floating point values from a 256-bit vector @@ -1765,7 +1775,12 @@ pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) { #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) { - intrinsics::nontemporal_store(mem_addr as *mut __m256, a); + crate::arch::asm!( + "vmovntps [{mem_addr}], {a}", + mem_addr = in(reg) mem_addr, + a = in(ymm_reg) a, + options(nostack, preserves_flags), + ); } /// Computes the approximate reciprocal of packed single-precision (32-bit) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index a37cda1c51..886c533d79 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -28014,7 +28014,12 @@ pub unsafe fn _mm_mask_testn_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) -> #[cfg_attr(test, assert_instr(vmovntps))] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) { - intrinsics::nontemporal_store(mem_addr as *mut __m512, a); + crate::arch::asm!( + "vmovntps [{mem_addr}], {a}", + mem_addr = in(reg) mem_addr, + a = in(zmm_reg) a, + options(nostack, preserves_flags), + ); } /// Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. @@ -28035,7 +28040,12 @@ pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) { #[cfg_attr(test, assert_instr(vmovntps))] //should be vmovntpd #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) { - intrinsics::nontemporal_store(mem_addr as *mut __m512d, a); + crate::arch::asm!( + "vmovntps [{mem_addr}], {a}", + mem_addr = in(reg) mem_addr, + a = in(zmm_reg) a, + options(nostack, preserves_flags), + ); } /// Store 512-bits of integer data from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. @@ -28056,7 +28066,12 @@ pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) { #[cfg_attr(test, assert_instr(vmovntps))] //should be vmovntdq #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm512_stream_si512(mem_addr: *mut i64, a: __m512i) { - intrinsics::nontemporal_store(mem_addr as *mut __m512i, a); + crate::arch::asm!( + "vmovntps [{mem_addr}], {a}", + mem_addr = in(reg) mem_addr, + a = in(zmm_reg) a, + options(nostack, preserves_flags), + ); } /// Sets packed 32-bit integers in `dst` with the supplied values. @@ -54373,9 +54388,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_stream_ps() { - #[repr(align(32))] + #[repr(align(64))] struct Memory { - pub data: [f32; 16], + pub data: [f32; 16], // 64 bytes } let a = _mm512_set1_ps(7.0); let mut mem = Memory { data: [-1.0; 16] }; diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs index e04b7910d2..e30816b508 100644 --- a/crates/core_arch/src/x86/mod.rs +++ b/crates/core_arch/src/x86/mod.rs @@ -2,7 +2,7 @@ #[allow(unused_imports)] use crate::marker::Sized; -use crate::{intrinsics, mem::transmute}; +use crate::mem::transmute; #[macro_use] mod macros; diff --git a/crates/core_arch/src/x86/sse.rs b/crates/core_arch/src/x86/sse.rs index bee8291dcc..ecba71fb58 100644 --- a/crates/core_arch/src/x86/sse.rs +++ b/crates/core_arch/src/x86/sse.rs @@ -2002,7 +2002,12 @@ extern "C" { #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) { - intrinsics::nontemporal_store(mem_addr as *mut __m128, a); + crate::arch::asm!( + "movntps [{mem_addr}], {a}", + mem_addr = in(reg) mem_addr, + a = in(xmm_reg) a, + options(nostack, preserves_flags), + ); } #[cfg(test)] diff --git a/crates/core_arch/src/x86/sse2.rs b/crates/core_arch/src/x86/sse2.rs index 70750e371d..639ea78894 100644 --- a/crates/core_arch/src/x86/sse2.rs +++ b/crates/core_arch/src/x86/sse2.rs @@ -1327,11 +1327,16 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) { /// /// See [`_mm_sfence`] for details. #[inline] -#[target_feature(enable = "sse2")] +#[target_feature(enable = "sse,sse2")] #[cfg_attr(test, assert_instr(movntps))] // FIXME movntdq #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) { - intrinsics::nontemporal_store(mem_addr, a); + crate::arch::asm!( + "movntps [{mem_addr}], {a}", + mem_addr = in(reg) mem_addr, + a = in(xmm_reg) a, + options(nostack, preserves_flags), + ); } /// Stores a 32-bit integer value in the specified memory location. @@ -1353,7 +1358,12 @@ pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) { #[cfg_attr(test, assert_instr(movnti))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) { - intrinsics::nontemporal_store(mem_addr, a); + crate::arch::asm!( + "movnti [{mem_addr}], {a:e}", // `:e` for 32bit value + mem_addr = in(reg) mem_addr, + a = in(reg) a, + options(nostack, preserves_flags), + ); } /// Returns a vector where the low element is extracted from `a` and its upper @@ -2543,12 +2553,17 @@ pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d { /// /// See [`_mm_sfence`] for details. #[inline] -#[target_feature(enable = "sse2")] +#[target_feature(enable = "sse,sse2")] #[cfg_attr(test, assert_instr(movntps))] // FIXME movntpd #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) { - intrinsics::nontemporal_store(mem_addr as *mut __m128d, a); + crate::arch::asm!( + "movntps [{mem_addr}], {a}", + mem_addr = in(reg) mem_addr, + a = in(xmm_reg) a, + options(nostack, preserves_flags), + ); } /// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a diff --git a/crates/core_arch/src/x86_64/sse2.rs b/crates/core_arch/src/x86_64/sse2.rs index ea3dbc41bc..22788b275a 100644 --- a/crates/core_arch/src/x86_64/sse2.rs +++ b/crates/core_arch/src/x86_64/sse2.rs @@ -1,9 +1,6 @@ //! `x86_64`'s Streaming SIMD Extensions 2 (SSE2) -use crate::{ - core_arch::x86::*, - intrinsics::{self, simd::*}, -}; +use crate::{core_arch::x86::*, intrinsics::simd::*}; #[cfg(test)] use stdarch_test::assert_instr; @@ -81,7 +78,12 @@ pub unsafe fn _mm_cvttsd_si64x(a: __m128d) -> i64 { #[cfg_attr(test, assert_instr(movnti))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) { - intrinsics::nontemporal_store(mem_addr, a); + crate::arch::asm!( + "movnti [{mem_addr}], {a}", + mem_addr = in(reg) mem_addr, + a = in(reg) a, + options(nostack, preserves_flags), + ); } /// Returns a vector whose lowest element is `a` and all higher elements are