Skip to content

Commit 2e4cce4

Browse files
hdevalence authored and alexcrichton committed
avx2: add _mm256_unpack{hi,lo}_epi{8,16,32,64} (rust-lang#147)
1 parent 4ca3e8d commit 2e4cce4

File tree

1 file changed

+311
-8
lines changed

1 file changed

+311
-8
lines changed

src/x86/avx2.rs

Lines changed: 311 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1402,14 +1402,317 @@ pub unsafe fn _mm256_subs_epu8(a: u8x32, b: u8x32) -> u8x32 {
14021402
psubusb(a, b)
14031403
}
14041404

1405-
// TODO __mm256_unpackhi_epi16 (__m256i a, __m256i b)
1406-
// TODO __m256i _mm256_unpackhi_epi32 (__m256i a, __m256i b)
1407-
// TODO __m256i _mm256_unpackhi_epi64 (__m256i a, __m256i b)
1408-
// TODO __m256i _mm256_unpackhi_epi8 (__m256i a, __m256i b)
1409-
// TODO __m256i _mm256_unpacklo_epi16 (__m256i a, __m256i b)
1410-
// TODO __m256i _mm256_unpacklo_epi32 (__m256i a, __m256i b)
1411-
// TODO __m256i _mm256_unpacklo_epi64 (__m256i a, __m256i b)
1412-
// TODO __m256i _mm256_unpacklo_epi8 (__m256i a, __m256i b)
1405+
/// Unpack and interleave 8-bit integers from the high half of each
1406+
/// 128-bit lane in `a` and `b`.
1407+
///
1408+
/// ```rust
1409+
/// # #![feature(cfg_target_feature)]
1410+
/// # #![feature(target_feature)]
1411+
/// #
1412+
/// # #[macro_use] extern crate stdsimd;
1413+
/// #
1414+
/// # fn main() {
1415+
/// # if cfg_feature_enabled!("avx2") {
1416+
/// # #[target_feature = "+avx2"]
1417+
/// # fn worker() {
1418+
/// use stdsimd::simd::i8x32;
1419+
/// use stdsimd::vendor::_mm256_unpackhi_epi8;
1420+
///
1421+
/// let a = i8x32::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
1422+
/// let b = i8x32::new(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31);
1423+
///
1424+
/// let c: i8x32;
1425+
/// unsafe {
1426+
/// c = _mm256_unpackhi_epi8(a, b);
1427+
/// }
1428+
///
1429+
/// let expected = i8x32::new(8,-8, 9,-9, 10,-10, 11,-11, 12,-12, 13,-13, 14,-14, 15,-15, 24,-24, 25,-25, 26,-26, 27,-27, 28,-28, 29,-29, 30,-30, 31,-31);
1430+
/// assert_eq!(c, expected);
1431+
///
1432+
/// # }
1433+
/// # worker();
1434+
/// # }
1435+
/// # }
1436+
/// ```
1437+
#[inline(always)]
1438+
#[target_feature = "+avx2"]
1439+
#[cfg_attr(test, assert_instr(vpunpckhbw))]
1440+
pub unsafe fn _mm256_unpackhi_epi8(a: i8x32, b: i8x32) -> i8x32 {
1441+
simd_shuffle32(a, b, [8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63])
1442+
}
1443+
1444+
/// Unpack and interleave 8-bit integers from the low half of each
1445+
/// 128-bit lane of `a` and `b`.
1446+
///
1447+
/// ```rust
1448+
/// # #![feature(cfg_target_feature)]
1449+
/// # #![feature(target_feature)]
1450+
/// #
1451+
/// # #[macro_use] extern crate stdsimd;
1452+
/// #
1453+
/// # fn main() {
1454+
/// # if cfg_feature_enabled!("avx2") {
1455+
/// # #[target_feature = "+avx2"]
1456+
/// # fn worker() {
1457+
/// use stdsimd::simd::i8x32;
1458+
/// use stdsimd::vendor::_mm256_unpacklo_epi8;
1459+
///
1460+
/// let a = i8x32::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
1461+
/// let b = i8x32::new(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31);
1462+
///
1463+
/// let c: i8x32;
1464+
/// unsafe {
1465+
/// c = _mm256_unpacklo_epi8(a, b);
1466+
/// }
1467+
///
1468+
/// let expected = i8x32::new(0, 0, 1,-1, 2,-2, 3,-3, 4,-4, 5,-5, 6,-6, 7,-7, 16,-16, 17,-17, 18,-18, 19,-19, 20,-20, 21,-21, 22,-22, 23,-23);
1469+
/// assert_eq!(c, expected);
1470+
///
1471+
/// # }
1472+
/// # worker();
1473+
/// # }
1474+
/// # }
1475+
/// ```
1476+
#[inline(always)]
1477+
#[target_feature = "+avx2"]
1478+
#[cfg_attr(test, assert_instr(vpunpcklbw))]
1479+
pub unsafe fn _mm256_unpacklo_epi8(a: i8x32, b: i8x32) -> i8x32 {
1480+
simd_shuffle32(a, b, [0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55])
1481+
}
1482+
1483+
/// Unpack and interleave 16-bit integers from the high half of each
1484+
/// 128-bit lane of `a` and `b`.
1485+
///
1486+
/// ```rust
1487+
/// # #![feature(cfg_target_feature)]
1488+
/// # #![feature(target_feature)]
1489+
/// #
1490+
/// # #[macro_use] extern crate stdsimd;
1491+
/// #
1492+
/// # fn main() {
1493+
/// # if cfg_feature_enabled!("avx2") {
1494+
/// # #[target_feature = "+avx2"]
1495+
/// # fn worker() {
1496+
/// use stdsimd::simd::i16x16;
1497+
/// use stdsimd::vendor::_mm256_unpackhi_epi16;
1498+
///
1499+
/// let a = i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1500+
/// let b = i16x16::new(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15);
1501+
///
1502+
/// let c: i16x16;
1503+
/// unsafe {
1504+
/// c = _mm256_unpackhi_epi16(a, b);
1505+
/// }
1506+
///
1507+
/// let expected = i16x16::new(4,-4, 5,-5, 6,-6, 7,-7, 12,-12, 13,-13, 14,-14, 15,-15);
1508+
/// assert_eq!(c, expected);
1509+
///
1510+
/// # }
1511+
/// # worker();
1512+
/// # }
1513+
/// # }
1514+
/// ```
1515+
#[inline(always)]
1516+
#[target_feature = "+avx2"]
1517+
#[cfg_attr(test, assert_instr(vpunpckhwd))]
1518+
pub unsafe fn _mm256_unpackhi_epi16(a: i16x16, b: i16x16) -> i16x16 {
1519+
simd_shuffle16(a, b, [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31])
1520+
}
1521+
1522+
/// Unpack and interleave 16-bit integers from the low half of each
1523+
/// 128-bit lane of `a` and `b`.
1524+
///
1525+
/// ```rust
1526+
/// # #![feature(cfg_target_feature)]
1527+
/// # #![feature(target_feature)]
1528+
/// #
1529+
/// # #[macro_use] extern crate stdsimd;
1530+
/// #
1531+
/// # fn main() {
1532+
/// # if cfg_feature_enabled!("avx2") {
1533+
/// # #[target_feature = "+avx2"]
1534+
/// # fn worker() {
1535+
/// use stdsimd::simd::i16x16;
1536+
/// use stdsimd::vendor::_mm256_unpacklo_epi16;
1537+
///
1538+
/// let a = i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1539+
/// let b = i16x16::new(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15);
1540+
///
1541+
/// let c: i16x16;
1542+
/// unsafe {
1543+
/// c = _mm256_unpacklo_epi16(a, b);
1544+
/// }
1545+
///
1546+
/// let expected = i16x16::new(0, 0, 1,-1, 2,-2, 3,-3, 8,-8, 9,-9, 10,-10, 11,-11);
1547+
/// assert_eq!(c, expected);
1548+
///
1549+
/// # }
1550+
/// # worker();
1551+
/// # }
1552+
/// # }
1553+
/// ```
1554+
#[inline(always)]
1555+
#[target_feature = "+avx2"]
1556+
#[cfg_attr(test, assert_instr(vpunpcklwd))]
1557+
pub unsafe fn _mm256_unpacklo_epi16(a: i16x16, b: i16x16) -> i16x16 {
1558+
simd_shuffle16(a, b, [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27])
1559+
}
1560+
1561+
/// Unpack and interleave 32-bit integers from the high half of each
1562+
/// 128-bit lane of `a` and `b`.
1563+
///
1564+
/// ```rust
1565+
/// # #![feature(cfg_target_feature)]
1566+
/// # #![feature(target_feature)]
1567+
/// #
1568+
/// # #[macro_use] extern crate stdsimd;
1569+
/// #
1570+
/// # fn main() {
1571+
/// # if cfg_feature_enabled!("avx2") {
1572+
/// # #[target_feature = "+avx2"]
1573+
/// # fn worker() {
1574+
/// use stdsimd::simd::i32x8;
1575+
/// use stdsimd::vendor::_mm256_unpackhi_epi32;
1576+
///
1577+
/// let a = i32x8::new(0, 1, 2, 3, 4, 5, 6, 7);
1578+
/// let b = i32x8::new(0,-1,-2,-3,-4,-5,-6,-7);
1579+
///
1580+
/// let c: i32x8;
1581+
/// unsafe {
1582+
/// c = _mm256_unpackhi_epi32(a, b);
1583+
/// }
1584+
///
1585+
/// let expected = i32x8::new(2,-2, 3,-3, 6,-6, 7,-7);
1586+
/// assert_eq!(c, expected);
1587+
///
1588+
/// # }
1589+
/// # worker();
1590+
/// # }
1591+
/// # }
1592+
/// ```
1593+
#[inline(always)]
1594+
#[target_feature = "+avx2"]
1595+
#[cfg_attr(test, assert_instr(vpunpckhdq))]
1596+
pub unsafe fn _mm256_unpackhi_epi32(a: i32x8, b: i32x8) -> i32x8 {
1597+
simd_shuffle8(a, b, [2, 10, 3, 11, 6, 14, 7, 15])
1598+
}
1599+
1600+
/// Unpack and interleave 32-bit integers from the low half of each
1601+
/// 128-bit lane of `a` and `b`.
1602+
///
1603+
/// ```rust
1604+
/// # #![feature(cfg_target_feature)]
1605+
/// # #![feature(target_feature)]
1606+
/// #
1607+
/// # #[macro_use] extern crate stdsimd;
1608+
/// #
1609+
/// # fn main() {
1610+
/// # if cfg_feature_enabled!("avx2") {
1611+
/// # #[target_feature = "+avx2"]
1612+
/// # fn worker() {
1613+
/// use stdsimd::simd::i32x8;
1614+
/// use stdsimd::vendor::_mm256_unpacklo_epi32;
1615+
///
1616+
/// let a = i32x8::new(0, 1, 2, 3, 4, 5, 6, 7);
1617+
/// let b = i32x8::new(0,-1,-2,-3,-4,-5,-6,-7);
1618+
///
1619+
/// let c: i32x8;
1620+
/// unsafe {
1621+
/// c = _mm256_unpacklo_epi32(a, b);
1622+
/// }
1623+
///
1624+
/// let expected = i32x8::new(0, 0, 1,-1, 4,-4, 5,-5);
1625+
/// assert_eq!(c, expected);
1626+
///
1627+
/// # }
1628+
/// # worker();
1629+
/// # }
1630+
/// # }
1631+
/// ```
1632+
#[inline(always)]
1633+
#[target_feature = "+avx2"]
1634+
#[cfg_attr(test, assert_instr(vpunpckldq))]
1635+
pub unsafe fn _mm256_unpacklo_epi32(a: i32x8, b: i32x8) -> i32x8 {
1636+
simd_shuffle8(a, b, [0, 8, 1, 9, 4, 12, 5, 13])
1637+
}
1638+
1639+
/// Unpack and interleave 64-bit integers from the high half of each
1640+
/// 128-bit lane of `a` and `b`.
1641+
///
1642+
/// ```rust
1643+
/// # #![feature(cfg_target_feature)]
1644+
/// # #![feature(target_feature)]
1645+
/// #
1646+
/// # #[macro_use] extern crate stdsimd;
1647+
/// #
1648+
/// # fn main() {
1649+
/// # if cfg_feature_enabled!("avx2") {
1650+
/// # #[target_feature = "+avx2"]
1651+
/// # fn worker() {
1652+
/// use stdsimd::simd::i64x4;
1653+
/// use stdsimd::vendor::_mm256_unpackhi_epi64;
1654+
///
1655+
/// let a = i64x4::new(0, 1, 2, 3);
1656+
/// let b = i64x4::new(0,-1,-2,-3);
1657+
///
1658+
/// let c: i64x4;
1659+
/// unsafe {
1660+
/// c = _mm256_unpackhi_epi64(a, b);
1661+
/// }
1662+
///
1663+
/// let expected = i64x4::new(1,-1, 3,-3);
1664+
/// assert_eq!(c, expected);
1665+
///
1666+
/// # }
1667+
/// # worker();
1668+
/// # }
1669+
/// # }
1670+
/// ```
1671+
#[inline(always)]
1672+
#[target_feature = "+avx2"]
1673+
#[cfg_attr(test, assert_instr(vpunpckhqdq))]
1674+
pub unsafe fn _mm256_unpackhi_epi64(a: i64x4, b: i64x4) -> i64x4 {
1675+
simd_shuffle4(a, b, [1, 5, 3, 7])
1676+
}
1677+
1678+
/// Unpack and interleave 64-bit integers from the low half of each
1679+
/// 128-bit lane of `a` and `b`.
1680+
///
1681+
/// ```rust
1682+
/// # #![feature(cfg_target_feature)]
1683+
/// # #![feature(target_feature)]
1684+
/// #
1685+
/// # #[macro_use] extern crate stdsimd;
1686+
/// #
1687+
/// # fn main() {
1688+
/// # if cfg_feature_enabled!("avx2") {
1689+
/// # #[target_feature = "+avx2"]
1690+
/// # fn worker() {
1691+
/// use stdsimd::simd::i64x4;
1692+
/// use stdsimd::vendor::_mm256_unpacklo_epi64;
1693+
///
1694+
/// let a = i64x4::new(0, 1, 2, 3);
1695+
/// let b = i64x4::new(0,-1,-2,-3);
1696+
///
1697+
/// let c: i64x4;
1698+
/// unsafe {
1699+
/// c = _mm256_unpacklo_epi64(a, b);
1700+
/// }
1701+
///
1702+
/// let expected = i64x4::new(0, 0, 2,-2);
1703+
/// assert_eq!(c, expected);
1704+
///
1705+
/// # }
1706+
/// # worker();
1707+
/// # }
1708+
/// # }
1709+
/// ```
1710+
#[inline(always)]
1711+
#[target_feature = "+avx2"]
1712+
#[cfg_attr(test, assert_instr(vpunpcklqdq))]
1713+
pub unsafe fn _mm256_unpacklo_epi64(a: i64x4, b: i64x4) -> i64x4 {
1714+
simd_shuffle4(a, b, [0, 4, 2, 6])
1715+
}
14131716

14141717
/// Compute the bitwise XOR of 256 bits (representing integer data)
14151718
/// in `a` and `b`

0 commit comments

Comments
 (0)