Merged · Changes from 3 commits
143 changes: 103 additions & 40 deletions crates/core_arch/src/x86/avx2.rs
@@ -2565,35 +2565,67 @@ pub unsafe fn _mm256_slli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))]
#[rustc_args_required_const(1)]
#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_slli_si256(a: __m256i, imm8: i32) -> __m256i {
pub unsafe fn _mm256_slli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
static_assert_imm8!(IMM8);
let a = a.as_i64x4();
macro_rules! call {
($imm8:expr) => {
vpslldq(a, $imm8)
};
}
transmute(constify_imm8!(imm8 * 8, call))
let r = vpslldq(a, IMM8 * 8);
transmute(r)
Member (review comment): You can just call `_mm256_bslli_epi128` here.
}
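
A minimal sketch of what the reviewer's suggested delegation could look like, assuming `_mm256_bslli_epi128` keeps the const-generic signature introduced in this change; attributes are omitted and this is not the committed code:

```rust
// Sketch only: forward the whole-register byte shift to the per-lane
// byte-shift intrinsic instead of re-deriving it here; both lower to vpslldq.
pub unsafe fn _mm256_slli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_imm8!(IMM8);
    _mm256_bslli_epi128::<IMM8>(a)
}
```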

/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bslli_epi128)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))]
#[rustc_args_required_const(1)]
#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_bslli_epi128(a: __m256i, imm8: i32) -> __m256i {
let a = a.as_i64x4();
macro_rules! call {
($imm8:expr) => {
vpslldq(a, $imm8)
};
}
transmute(constify_imm8!(imm8 * 8, call))
pub unsafe fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
static_assert_imm8!(IMM8);
let a = a.as_i8x32();
let zero = _mm256_setzero_si256().as_i8x32();
let r: i8x32 = simd_shuffle32(
zero,
a,
[
32 - (IMM8 as u32 & 0xff),
33 - (IMM8 as u32 & 0xff),
34 - (IMM8 as u32 & 0xff),
35 - (IMM8 as u32 & 0xff),
36 - (IMM8 as u32 & 0xff),
37 - (IMM8 as u32 & 0xff),
38 - (IMM8 as u32 & 0xff),
39 - (IMM8 as u32 & 0xff),
40 - (IMM8 as u32 & 0xff),
41 - (IMM8 as u32 & 0xff),
42 - (IMM8 as u32 & 0xff),
43 - (IMM8 as u32 & 0xff),
44 - (IMM8 as u32 & 0xff),
45 - (IMM8 as u32 & 0xff),
46 - (IMM8 as u32 & 0xff),
47 - (IMM8 as u32 & 0xff),
48 - (IMM8 as u32 & 0xff) - 16,
49 - (IMM8 as u32 & 0xff) - 16,
50 - (IMM8 as u32 & 0xff) - 16,
51 - (IMM8 as u32 & 0xff) - 16,
52 - (IMM8 as u32 & 0xff) - 16,
53 - (IMM8 as u32 & 0xff) - 16,
54 - (IMM8 as u32 & 0xff) - 16,
55 - (IMM8 as u32 & 0xff) - 16,
56 - (IMM8 as u32 & 0xff) - 16,
57 - (IMM8 as u32 & 0xff) - 16,
58 - (IMM8 as u32 & 0xff) - 16,
59 - (IMM8 as u32 & 0xff) - 16,
60 - (IMM8 as u32 & 0xff) - 16,
61 - (IMM8 as u32 & 0xff) - 16,
62 - (IMM8 as u32 & 0xff) - 16,
63 - (IMM8 as u32 & 0xff) - 16,
],
);
transmute(r)
}

/// Shifts packed 32-bit integers in `a` left by the amount
@@ -2729,35 +2761,66 @@ pub unsafe fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))]
#[rustc_args_required_const(1)]
#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_srli_si256(a: __m256i, imm8: i32) -> __m256i {
pub unsafe fn _mm256_srli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
let a = a.as_i64x4();
macro_rules! call {
($imm8:expr) => {
vpsrldq(a, $imm8)
};
}
transmute(constify_imm8!(imm8 * 8, call))
let r = vpsrldq(a, IMM8 * 8);
transmute(r)
Member (review comment): You can just call `_mm256_bsrli_epi128` here.
}
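
Likewise, a rough sketch of the mirrored suggestion for the right shift, again assuming the new const-generic `_mm256_bsrli_epi128` (not the committed code):

```rust
// Sketch only: delegate the whole-register byte shift right to the
// per-lane intrinsic; both lower to vpsrldq.
pub unsafe fn _mm256_srli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_imm8!(IMM8);
    _mm256_bsrli_epi128::<IMM8>(a)
}
```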

/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bsrli_epi128)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))]
#[rustc_args_required_const(1)]
#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_bsrli_epi128(a: __m256i, imm8: i32) -> __m256i {
let a = a.as_i64x4();
macro_rules! call {
($imm8:expr) => {
vpsrldq(a, $imm8)
};
}
transmute(constify_imm8!(imm8 * 8, call))
pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
static_assert_imm8!(IMM8);
let a = a.as_i8x32();
let zero = _mm256_setzero_si256().as_i8x32();
let r: i8x32 = simd_shuffle32(
a,
zero,
[
0 + (IMM8 as u32 & 0xff) + 16,
1 + (IMM8 as u32 & 0xff) + 16,
2 + (IMM8 as u32 & 0xff) + 16,
3 + (IMM8 as u32 & 0xff) + 16,
4 + (IMM8 as u32 & 0xff) + 16,
5 + (IMM8 as u32 & 0xff) + 16,
6 + (IMM8 as u32 & 0xff) + 16,
7 + (IMM8 as u32 & 0xff) + 16,
8 + (IMM8 as u32 & 0xff) + 16,
9 + (IMM8 as u32 & 0xff) + 16,
10 + (IMM8 as u32 & 0xff) + 16,
11 + (IMM8 as u32 & 0xff) + 16,
12 + (IMM8 as u32 & 0xff) + 16,
13 + (IMM8 as u32 & 0xff) + 16,
14 + (IMM8 as u32 & 0xff) + 16,
15 + (IMM8 as u32 & 0xff) + 16,
16 + (IMM8 as u32 & 0xff),
17 + (IMM8 as u32 & 0xff),
18 + (IMM8 as u32 & 0xff),
19 + (IMM8 as u32 & 0xff),
20 + (IMM8 as u32 & 0xff),
21 + (IMM8 as u32 & 0xff),
22 + (IMM8 as u32 & 0xff),
23 + (IMM8 as u32 & 0xff),
24 + (IMM8 as u32 & 0xff),
25 + (IMM8 as u32 & 0xff),
26 + (IMM8 as u32 & 0xff),
27 + (IMM8 as u32 & 0xff),
28 + (IMM8 as u32 & 0xff),
29 + (IMM8 as u32 & 0xff),
30 + (IMM8 as u32 & 0xff),
31 + (IMM8 as u32 & 0xff),
],
);
transmute(r)
}

/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
@@ -4824,7 +4887,7 @@ mod tests {
#[simd_test(enable = "avx2")]
unsafe fn test_mm256_slli_si256() {
let a = _mm256_set1_epi64x(0xFFFFFFFF);
let r = _mm256_slli_si256(a, 3);
let r = _mm256_slli_si256::<3>(a);
assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF000000));
}

@@ -4923,7 +4986,7 @@ mod tests {
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32,
);
let r = _mm256_srli_si256(a, 3);
let r = _mm256_srli_si256::<3>(a);
#[rustfmt::skip]
let e = _mm256_setr_epi8(
4, 5, 6, 7, 8, 9, 10, 11,
56 changes: 17 additions & 39 deletions crates/core_arch/src/x86/f16c.rs
@@ -4,7 +4,7 @@

use crate::{
core_arch::{simd::*, x86::*},
hint::unreachable_unchecked,
// hint::unreachable_unchecked,
Member (review comment): Deleted commented code.
mem::transmute,
};

@@ -42,22 +42,6 @@ pub unsafe fn _mm256_cvtph_ps(a: __m128i) -> __m256 {
transmute(llvm_vcvtph2ps_256(transmute(a)))
}

macro_rules! dispatch_rounding {
($rounding:ident, $call:ident) => {{
match $rounding {
0 => call!(0),
1 => call!(1),
2 => call!(2),
3 => call!(3),
4 => call!(4),
5 => call!(5),
6 => call!(6),
7 => call!(7),
_ => unreachable_unchecked(),
}
}};
}

/// Converts the 4 x 32-bit float values in the 128-bit vector `a` into 4 x
/// 16-bit half-precision float values stored in the lowest 64-bit of a 128-bit
/// vector.
@@ -71,16 +55,13 @@ macro_rules! dispatch_rounding {
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`].
#[inline]
#[target_feature(enable = "f16c")]
#[rustc_args_required_const(1)]
#[cfg_attr(test, assert_instr("vcvtps2ph", imm_rounding = 0))]
pub unsafe fn _mm_cvtps_ph(a: __m128, imm_rounding: i32) -> __m128i {
let a = transmute(a);
macro_rules! call {
($rounding:expr) => {
llvm_vcvtps2ph_128(a, $rounding)
};
}
transmute(dispatch_rounding!(imm_rounding, call))
#[cfg_attr(test, assert_instr("vcvtps2ph", IMM_ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm_cvtps_ph<const IMM_ROUNDING: i32>(a: __m128) -> __m128i {
static_assert_imm3!(IMM_ROUNDING);
let a = a.as_f32x4();
let r = llvm_vcvtps2ph_128(a, IMM_ROUNDING);
transmute(r)
}
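
For illustration, a hedged usage sketch of the new const-generic form: the rounding mode is now a compile-time argument, so a caller might write something like the following (assuming the `_MM_FROUND_*` constants exported elsewhere in the crate; values are illustrative only):

```rust
// Sketch: convert four f32 lanes to half precision using the current
// MXCSR rounding mode, then widen them back.
let v = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
let packed: __m128i = _mm_cvtps_ph::<_MM_FROUND_CUR_DIRECTION>(v);
let widened: __m128 = _mm_cvtph_ps(packed);
```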

/// Converts the 8 x 32-bit float values in the 256-bit vector `a` into 8 x
@@ -95,16 +76,13 @@ pub unsafe fn _mm_cvtps_ph(a: __m128, imm_rounding: i32) -> __m128i {
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`].
#[inline]
#[target_feature(enable = "f16c")]
#[rustc_args_required_const(1)]
#[cfg_attr(test, assert_instr("vcvtps2ph", imm_rounding = 0))]
pub unsafe fn _mm256_cvtps_ph(a: __m256, imm_rounding: i32) -> __m128i {
let a = transmute(a);
macro_rules! call {
($rounding:expr) => {
llvm_vcvtps2ph_256(a, $rounding)
};
}
transmute(dispatch_rounding!(imm_rounding, call))
#[cfg_attr(test, assert_instr("vcvtps2ph", IMM_ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm256_cvtps_ph<const IMM_ROUNDING: i32>(a: __m256) -> __m128i {
static_assert_imm3!(IMM_ROUNDING);
let a = a.as_f32x8();
let r = llvm_vcvtps2ph_256(a, IMM_ROUNDING);
transmute(r)
}

#[cfg(test)]
@@ -116,7 +94,7 @@ mod tests {
unsafe fn test_mm_cvtph_ps() {
let array = [1_f32, 2_f32, 3_f32, 4_f32];
let float_vec: __m128 = transmute(array);
let halfs: __m128i = _mm_cvtps_ph(float_vec, 0);
let halfs: __m128i = _mm_cvtps_ph::<0>(float_vec);
let floats: __m128 = _mm_cvtph_ps(halfs);
let result: [f32; 4] = transmute(floats);
assert_eq!(result, array);
@@ -126,7 +104,7 @@ mod tests {
unsafe fn test_mm256_cvtph_ps() {
let array = [1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32];
let float_vec: __m256 = transmute(array);
let halfs: __m128i = _mm256_cvtps_ph(float_vec, 0);
let halfs: __m128i = _mm256_cvtps_ph::<0>(float_vec);
let floats: __m256 = _mm256_cvtph_ps(halfs);
let result: [f32; 8] = transmute(floats);
assert_eq!(result, array);