From f0fd6f78d48064972a5b7e73f219e0f99661b76e Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Wed, 18 Feb 2026 21:28:03 +0100
Subject: [PATCH 01/19] cleanup long shuffle mask literals

---
 .../src/arm_shared/neon/generated.rs          | 240 +++---------------
 .../spec/neon/arm_shared.spec.yml             |  72 +++---
 2 files changed, 72 insertions(+), 240 deletions(-)
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs
index 45c83b880e907..13dee7a6e6e0e 100644
--- a/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs
+++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs
@@ -10149,7 +10149,7 @@ pub fn vdotq_u32(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t {
 #[cfg(not(target_arch = "arm64ec"))]
 pub fn vdup_lane_f16<const N: i32>(a: float16x4_t) -> float16x4_t {
     static_assert_uimm_bits!(N, 2);
-    unsafe { simd_shuffle!(a, a, [N as u32, N as u32, N as u32, N as u32]) }
+    unsafe { simd_shuffle!(a, a, [N as u32; 4]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_lane_f16)"]
@@ -10174,13 +10174,7 @@ pub fn vdup_lane_f16<const N: i32>(a: float16x4_t) -> float16x4_t {
 #[cfg(not(target_arch = "arm64ec"))]
 pub fn vdupq_lane_f16<const N: i32>(a: float16x4_t) -> float16x8_t {
     static_assert_uimm_bits!(N, 2);
-    unsafe {
-        simd_shuffle!(
-            a,
-            a,
-            [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]
-        )
-    }
+    unsafe { simd_shuffle!(a, a, [N as u32; 8]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_lane_f32)"]
@@ -10341,7 +10335,7 @@ pub fn vdupq_lane_u32<const N: i32>(a: uint32x2_t) -> uint32x4_t {
 )]
 pub fn vdup_lane_p16<const N: i32>(a: poly16x4_t) -> poly16x4_t {
     static_assert_uimm_bits!(N, 2);
-    unsafe { simd_shuffle!(a, a, [N as u32, N as u32, N as u32, N as u32]) }
+    unsafe { simd_shuffle!(a, a, [N as u32; 4]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_lane_s16)"]
@@ -10364,7 +10358,7 @@ pub fn vdup_lane_p16<const N: i32>(a: poly16x4_t) -> poly16x4_t {
 )]
 pub fn vdup_lane_s16<const N: i32>(a: int16x4_t) -> int16x4_t {
     static_assert_uimm_bits!(N, 2);
-    unsafe { simd_shuffle!(a, a, [N as u32, N as u32, N as u32, N as u32]) }
+    unsafe { simd_shuffle!(a, a, [N as u32; 4]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_lane_u16)"]
@@ -10387,7 +10381,7 @@ pub fn vdup_lane_s16<const N: i32>(a: int16x4_t) -> int16x4_t {
 )]
 pub fn vdup_lane_u16<const N: i32>(a: uint16x4_t) -> uint16x4_t {
     static_assert_uimm_bits!(N, 2);
-    unsafe { simd_shuffle!(a, a, [N as u32, N as u32, N as u32, N as u32]) }
+    unsafe { simd_shuffle!(a, a, [N as u32; 4]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_lane_p16)"]
@@ -10410,13 +10404,7 @@ pub fn vdup_lane_u16<const N: i32>(a: uint16x4_t) -> uint16x4_t {
 )]
 pub fn vdupq_lane_p16<const N: i32>(a: poly16x4_t) -> poly16x8_t {
     static_assert_uimm_bits!(N, 2);
-    unsafe {
-        simd_shuffle!(
-            a,
-            a,
-            [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]
-        )
-    }
+    unsafe { simd_shuffle!(a, a, [N as u32; 8]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_lane_s16)"]
@@ -10439,13 +10427,7 @@ pub fn vdupq_lane_p16<const N: i32>(a: poly16x4_t) -> poly16x8_t {
 )]
 pub fn vdupq_lane_s16<const N: i32>(a: int16x4_t) -> int16x8_t {
     static_assert_uimm_bits!(N, 2);
-    unsafe {
-        simd_shuffle!(
-            a,
-            a,
-            [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]
-        )
-    }
+    unsafe { simd_shuffle!(a, a, [N as u32; 8]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_lane_u16)"]
@@ -10468,13 +10450,7 @@ pub fn vdupq_lane_s16<const N: i32>(a: int16x4_t) -> int16x8_t {
 )]
 pub fn vdupq_lane_u16<const N: i32>(a: uint16x4_t) -> uint16x8_t {
     static_assert_uimm_bits!(N, 2);
-    unsafe {
-        simd_shuffle!(
-            a,
-            a,
-            [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]
-        )
-    }
+    unsafe { simd_shuffle!(a, a, [N as u32; 8]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_lane_p8)"]
@@ -10497,13 +10473,7 @@ pub fn vdupq_lane_u16<const N: i32>(a: uint16x4_t) -> uint16x8_t {
 )]
 pub fn vdup_lane_p8<const N: i32>(a: poly8x8_t) -> poly8x8_t {
     static_assert_uimm_bits!(N, 3);
-    unsafe {
-        simd_shuffle!(
-            a,
-            a,
-            [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]
-        )
-    }
+    unsafe { simd_shuffle!(a, a, [N as u32; 8]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_lane_s8)"]
@@ -10526,13 +10496,7 @@ pub fn vdup_lane_p8<const N: i32>(a: poly8x8_t) -> poly8x8_t {
 )]
 pub fn vdup_lane_s8<const N: i32>(a: int8x8_t) -> int8x8_t {
     static_assert_uimm_bits!(N, 3);
-    unsafe {
-        simd_shuffle!(
-            a,
-            a,
-            [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]
-        )
-    }
+    unsafe { simd_shuffle!(a, a, [N as u32; 8]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_lane_u8)"]
@@ -10555,13 +10519,7 @@ pub fn vdup_lane_s8<const N: i32>(a: int8x8_t) -> int8x8_t {
 )]
 pub fn vdup_lane_u8<const N: i32>(a: uint8x8_t) -> uint8x8_t {
     static_assert_uimm_bits!(N, 3);
-    unsafe {
-        simd_shuffle!(
-            a,
-            a,
-            [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]
-        )
-    }
+    unsafe { simd_shuffle!(a, a, [N as u32; 8]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_lane_p8)"]
@@ -10584,16 +10542,7 @@ pub fn vdup_lane_u8<const N: i32>(a: uint8x8_t) -> uint8x8_t {
 )]
 pub fn vdupq_lane_p8<const N: i32>(a: poly8x8_t) -> poly8x16_t {
     static_assert_uimm_bits!(N, 3);
-    unsafe {
-        simd_shuffle!(
-            a,
-            a,
-            [
-                N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32,
-                N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32
-            ]
-        )
-    }
+    unsafe { simd_shuffle!(a, a, [N as u32; 16]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_lane_s8)"]
@@ -10616,16 +10565,7 @@ pub fn vdupq_lane_p8<const N: i32>(a: poly8x8_t) -> poly8x16_t {
 )]
 pub fn vdupq_lane_s8<const N: i32>(a: int8x8_t) -> int8x16_t {
     static_assert_uimm_bits!(N, 3);
-    unsafe {
-        simd_shuffle!(
-            a,
-            a,
-            [
-                N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32,
-                N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32
-            ]
-        )
-    }
+    unsafe { simd_shuffle!(a, a, [N as u32; 16]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_lane_u8)"]
@@ -10648,16 +10588,7 @@ pub fn vdupq_lane_s8<const N: i32>(a: int8x8_t) -> int8x16_t {
 )]
 pub fn vdupq_lane_u8<const N: i32>(a: uint8x8_t) -> uint8x16_t {
     static_assert_uimm_bits!(N, 3);
-    unsafe {
-        simd_shuffle!(
-            a,
-            a,
-            [
-                N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32,
-                N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32
-            ]
-        )
-    }
+    unsafe { simd_shuffle!(a, a, [N as u32; 16]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_lane_s64)"]
@@ -10728,7 +10659,7 @@ pub fn vdup_lane_u64<const N: i32>(a: uint64x1_t) -> uint64x1_t {
 #[cfg(not(target_arch = "arm64ec"))]
 pub fn vdup_laneq_f16<const N: i32>(a: float16x8_t) -> float16x4_t {
     static_assert_uimm_bits!(N, 3);
-    unsafe { simd_shuffle!(a, a, [N as u32, N as u32, N as u32, N as u32]) }
+    unsafe { simd_shuffle!(a, a, [N as u32; 4]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_laneq_f16)"]
@@ -10753,13 +10684,7 @@ pub fn vdup_laneq_f16<const N: i32>(a: float16x8_t) -> float16x4_t {
 #[cfg(not(target_arch = "arm64ec"))]
 pub fn vdupq_laneq_f16<const N: i32>(a: float16x8_t) -> float16x8_t {
     static_assert_uimm_bits!(N, 3);
-    unsafe {
-        simd_shuffle!(
-            a,
-            a,
-            [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]
-        )
-    }
+    unsafe { simd_shuffle!(a, a, [N as u32; 8]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_laneq_f32)"]
@@ -10920,7 +10845,7 @@ pub fn vdupq_laneq_u32<const N: i32>(a: uint32x4_t) -> uint32x4_t {
 )]
 pub fn vdup_laneq_p16<const N: i32>(a: poly16x8_t) -> poly16x4_t {
     static_assert_uimm_bits!(N, 3);
-    unsafe { simd_shuffle!(a, a, [N as u32, N as u32, N as u32, N as u32]) }
+    unsafe { simd_shuffle!(a, a, [N as u32; 4]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_laneq_s16)"]
@@ -10943,7 +10868,7 @@ pub fn vdup_laneq_p16<const N: i32>(a: poly16x8_t) -> poly16x4_t {
 )]
 pub fn vdup_laneq_s16<const N: i32>(a: int16x8_t) -> int16x4_t {
     static_assert_uimm_bits!(N, 3);
-    unsafe { simd_shuffle!(a, a, [N as u32, N as u32, N as u32, N as u32]) }
+    unsafe { simd_shuffle!(a, a, [N as u32; 4]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_laneq_u16)"]
@@ -10966,7 +10891,7 @@ pub fn vdup_laneq_s16<const N: i32>(a: int16x8_t) -> int16x4_t {
 )]
 pub fn vdup_laneq_u16<const N: i32>(a: uint16x8_t) -> uint16x4_t {
     static_assert_uimm_bits!(N, 3);
-    unsafe { simd_shuffle!(a, a, [N as u32, N as u32, N as u32, N as u32]) }
+    unsafe { simd_shuffle!(a, a, [N as u32; 4]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_laneq_p16)"]
@@ -10989,13 +10914,7 @@ pub fn vdup_laneq_u16<const N: i32>(a: uint16x8_t) -> uint16x4_t {
 )]
 pub fn vdupq_laneq_p16<const N: i32>(a: poly16x8_t) -> poly16x8_t {
     static_assert_uimm_bits!(N, 3);
-    unsafe {
-        simd_shuffle!(
-            a,
-            a,
-            [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]
-        )
-    }
+    unsafe { simd_shuffle!(a, a, [N as u32; 8]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_laneq_s16)"]
@@ -11018,13 +10937,7 @@ pub fn vdupq_laneq_p16<const N: i32>(a: poly16x8_t) -> poly16x8_t {
 )]
 pub fn vdupq_laneq_s16<const N: i32>(a: int16x8_t) -> int16x8_t {
     static_assert_uimm_bits!(N, 3);
-    unsafe {
-        simd_shuffle!(
-            a,
-            a,
-            [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]
-        )
-    }
+    unsafe { simd_shuffle!(a, a, [N as u32; 8]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_laneq_u16)"]
@@ -11047,13 +10960,7 @@ pub fn vdupq_laneq_s16<const N: i32>(a: int16x8_t) -> int16x8_t {
 )]
 pub fn vdupq_laneq_u16<const N: i32>(a: uint16x8_t) -> uint16x8_t {
     static_assert_uimm_bits!(N, 3);
-    unsafe {
-        simd_shuffle!(
-            a,
-            a,
-            [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]
-        )
-    }
+    unsafe { simd_shuffle!(a, a, [N as u32; 8]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_laneq_p8)"]
@@ -11076,13 +10983,7 @@ pub fn vdupq_laneq_u16<const N: i32>(a: uint16x8_t) -> uint16x8_t {
 )]
 pub fn vdup_laneq_p8<const N: i32>(a: poly8x16_t) -> poly8x8_t {
     static_assert_uimm_bits!(N, 4);
-    unsafe {
-        simd_shuffle!(
-            a,
-            a,
-            [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]
-        )
-    }
+    unsafe { simd_shuffle!(a, a, [N as u32; 8]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_laneq_s8)"]
@@ -11105,13 +11006,7 @@ pub fn vdup_laneq_p8<const N: i32>(a: poly8x16_t) -> poly8x8_t {
 )]
 pub fn vdup_laneq_s8<const N: i32>(a: int8x16_t) -> int8x8_t {
     static_assert_uimm_bits!(N, 4);
-    unsafe {
-        simd_shuffle!(
-            a,
-            a,
-            [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]
-        )
-    }
+    unsafe { simd_shuffle!(a, a, [N as u32; 8]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_laneq_u8)"]
@@ -11134,13 +11029,7 @@ pub fn vdup_laneq_s8<const N: i32>(a: int8x16_t) -> int8x8_t {
 )]
 pub fn vdup_laneq_u8<const N: i32>(a: uint8x16_t) -> uint8x8_t {
     static_assert_uimm_bits!(N, 4);
-    unsafe {
-        simd_shuffle!(
-            a,
-            a,
-            [N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]
-        )
-    }
+    unsafe { simd_shuffle!(a, a, [N as u32; 8]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_laneq_p8)"]
@@ -11163,16 +11052,7 @@ pub fn vdup_laneq_u8<const N: i32>(a: uint8x16_t) -> uint8x8_t {
 )]
 pub fn vdupq_laneq_p8<const N: i32>(a: poly8x16_t) -> poly8x16_t {
     static_assert_uimm_bits!(N, 4);
-    unsafe {
-        simd_shuffle!(
-            a,
-            a,
-            [
-                N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32,
-                N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32
-            ]
-        )
-    }
+    unsafe { simd_shuffle!(a, a, [N as u32; 16]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_laneq_s8)"]
@@ -11195,16 +11075,7 @@ pub fn vdupq_laneq_p8<const N: i32>(a: poly8x16_t) -> poly8x16_t {
 )]
 pub fn vdupq_laneq_s8<const N: i32>(a: int8x16_t) -> int8x16_t {
     static_assert_uimm_bits!(N, 4);
-    unsafe {
-        simd_shuffle!(
-            a,
-            a,
-            [
-                N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32,
-                N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32
-            ]
-        )
-    }
+    unsafe { simd_shuffle!(a, a, [N as u32; 16]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_laneq_u8)"]
@@ -11227,16 +11098,7 @@ pub fn vdupq_laneq_s8<const N: i32>(a: int8x16_t) -> int8x16_t {
 )]
 pub fn vdupq_laneq_u8<const N: i32>(a: uint8x16_t) -> uint8x16_t {
     static_assert_uimm_bits!(N, 4);
-    unsafe {
-        simd_shuffle!(
-            a,
-            a,
-            [
-                N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32,
-                N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32
-            ]
-        )
-    }
+    unsafe { simd_shuffle!(a, a, [N as u32; 16]) }
 }
 #[doc = "Set all vector lanes to the same value"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdup_laneq_s64)"]
@@ -35894,7 +35756,7 @@ pub fn vqdmulhq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
 pub fn vqdmull_lane_s16<const N: i32>(a: int16x4_t, b: int16x4_t) -> int32x4_t {
     static_assert_uimm_bits!(N, 2);
     unsafe {
-        let b: int16x4_t = simd_shuffle!(b, b, [N as u32, N as u32, N as u32, N as u32]);
+        let b: int16x4_t = simd_shuffle!(b, b, [N as u32; 4]);
         vqdmull_s16(a, b)
     }
 }
@@ -35920,7 +35782,7 @@ pub fn vqdmull_lane_s16<const N: i32>(a: int16x4_t, b: int16x4_t) -> int32x4_t {
 pub fn vqdmull_lane_s32<const N: i32>(a: int32x2_t, b: int32x2_t) -> int64x2_t {
     static_assert_uimm_bits!(N, 1);
     unsafe {
-        let b: int32x2_t = simd_shuffle!(b, b, [N as u32, N as u32]);
+        let b: int32x2_t = simd_shuffle!(b, b, [N as u32; 2]);
         vqdmull_s32(a, b)
     }
 }
@@ -37480,17 +37342,7 @@ pub fn vqrshrn_n_u16<const N: i32>(a: uint16x8_t) -> uint8x8_t {
         #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnu.v8i8")]
         fn _vqrshrn_n_u16(a: uint16x8_t, n: uint16x8_t) -> uint8x8_t;
     }
-    unsafe {
-        _vqrshrn_n_u16(
-            a,
-            const {
-                uint16x8_t([
-                    -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16,
-                    -N as u16,
-                ])
-            },
-        )
-    }
+    unsafe { _vqrshrn_n_u16(a, const { uint16x8_t([-N as u16; 8]) }) }
 }
 #[doc = "Unsigned signed saturating rounded shift right narrow"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_n_u32)"]
@@ -37506,12 +37358,7 @@ pub fn vqrshrn_n_u32<const N: i32>(a: uint32x4_t) -> uint16x4_t {
         #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnu.v4i16")]
         fn _vqrshrn_n_u32(a: uint32x4_t, n: uint32x4_t) -> uint16x4_t;
     }
-    unsafe {
-        _vqrshrn_n_u32(
-            a,
-            const { uint32x4_t([-N as u32, -N as u32, -N as u32, -N as u32]) },
-        )
-    }
+    unsafe { _vqrshrn_n_u32(a, const { uint32x4_t([-N as u32; 4]) }) }
 }
 #[doc = "Unsigned signed saturating rounded shift right narrow"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_n_u64)"]
@@ -37527,7 +37374,7 @@ pub fn vqrshrn_n_u64<const N: i32>(a: uint64x2_t) -> uint32x2_t {
         #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnu.v2i32")]
         fn _vqrshrn_n_u64(a: uint64x2_t, n: uint64x2_t) -> uint32x2_t;
     }
-    unsafe { _vqrshrn_n_u64(a, const { uint64x2_t([-N as u64, -N as u64]) }) }
+    unsafe { _vqrshrn_n_u64(a, const { uint64x2_t([-N as u64; 2]) }) }
 }
 #[doc = "Unsigned signed saturating rounded shift right narrow"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_n_u16)"]
@@ -38922,17 +38769,7 @@ pub fn vqshrn_n_u16<const N: i32>(a: uint16x8_t) -> uint8x8_t {
         #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnu.v8i8")]
         fn _vqshrn_n_u16(a: uint16x8_t, n: uint16x8_t) -> uint8x8_t;
     }
-    unsafe {
-        _vqshrn_n_u16(
-            a,
-            const {
-                uint16x8_t([
-                    -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16,
-                    -N as u16,
-                ])
-            },
-        )
-    }
+    unsafe { _vqshrn_n_u16(a, const { uint16x8_t([-N as u16; 8]) }) }
 }
 #[doc = "Unsigned saturating shift right narrow"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_n_u32)"]
@@ -38948,12 +38785,7 @@ pub fn vqshrn_n_u32<const N: i32>(a: uint32x4_t) -> uint16x4_t {
         #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnu.v4i16")]
         fn _vqshrn_n_u32(a: uint32x4_t, n: uint32x4_t) -> uint16x4_t;
     }
-    unsafe {
-        _vqshrn_n_u32(
-            a,
-            const { uint32x4_t([-N as u32, -N as u32, -N as u32, -N as u32]) },
-        )
-    }
+    unsafe { _vqshrn_n_u32(a, const { uint32x4_t([-N as u32; 4]) }) }
 }
 #[doc = "Unsigned saturating shift right narrow"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_n_u64)"]
@@ -38969,7 +38801,7 @@ pub fn vqshrn_n_u64<const N: i32>(a: uint64x2_t) -> uint32x2_t {
         #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnu.v2i32")]
         fn _vqshrn_n_u64(a: uint64x2_t, n: uint64x2_t) -> uint32x2_t;
     }
-    unsafe { _vqshrn_n_u64(a, const { uint64x2_t([-N as u64, -N as u64]) }) }
+    unsafe { _vqshrn_n_u64(a, const { uint64x2_t([-N as u64; 2]) }) }
 }
 #[doc = "Unsigned saturating shift right narrow"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_n_u16)"]
diff --git a/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml b/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml
index 8e10fff984ac7..90cd0c80a1c18 100644
--- a/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml
+++ b/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml
@@ -1439,12 +1439,12 @@ intrinsics:
     static_defs: ['const N: i32']
     safety: safe
     types:
-      - [_lane_s8, int8x8_t, int8x8_t, '3', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]']
-      - [q_lane_s8, int8x8_t, int8x16_t, '3', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]']
-      - [_lane_u8, uint8x8_t, uint8x8_t, '3', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]']
-      - [q_lane_u8, uint8x8_t, uint8x16_t, '3', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]']
-      - [_lane_p8, poly8x8_t, poly8x8_t, '3', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]']
-      - [q_lane_p8, poly8x8_t, poly8x16_t, '3', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]']
+      - [_lane_s8, int8x8_t, int8x8_t, '3', '[N as u32; 8]']
+      - [q_lane_s8, int8x8_t, int8x16_t, '3', '[N as u32; 16]']
+      - [_lane_u8, uint8x8_t, uint8x8_t, '3', '[N as u32; 8]']
+      - [q_lane_u8, uint8x8_t, uint8x16_t, '3', '[N as u32; 16]']
+      - [_lane_p8, poly8x8_t, poly8x8_t, '3', '[N as u32; 8]']
+      - [q_lane_p8, poly8x8_t, poly8x16_t, '3', '[N as u32; 16]']
     compose:
       - FnCall: [static_assert_uimm_bits!, [N, "{type[3]}"]]
       - FnCall: [simd_shuffle!, [a, a, "{type[4]}"]]
@@ -1463,12 +1463,12 @@ intrinsics:
     static_defs: ['const N: i32']
     safety: safe
     types:
-      - [q_laneq_s8, int8x16_t, int8x16_t, '4', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]']
-      - [_laneq_s8, int8x16_t, int8x8_t, '4', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]']
-      - [q_laneq_u8, uint8x16_t, uint8x16_t, '4', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]']
-      - [_laneq_u8, uint8x16_t, uint8x8_t, '4', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]']
-      - [q_laneq_p8, poly8x16_t, poly8x16_t, '4', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]']
-      - [_laneq_p8, poly8x16_t, poly8x8_t, '4', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]']
+      - [q_laneq_s8, int8x16_t, int8x16_t, '4', '[N as u32; 16]']
+      - [_laneq_s8, int8x16_t, int8x8_t, '4', '[N as u32; 8]']
+      - [q_laneq_u8, uint8x16_t, uint8x16_t, '4', '[N as u32; 16]']
+      - [_laneq_u8, uint8x16_t, uint8x8_t, '4', '[N as u32; 8]']
+      - [q_laneq_p8, poly8x16_t, poly8x16_t, '4', '[N as u32; 16]']
+      - [_laneq_p8, poly8x16_t, poly8x8_t, '4', '[N as u32; 8]']
     compose:
       - FnCall: [static_assert_uimm_bits!, [N, "{type[3]}"]]
       - FnCall: [simd_shuffle!, [a, a, "{type[4]}"]]
@@ -1487,12 +1487,12 @@ intrinsics:
     static_defs: ['const N: i32']
     safety: safe
     types:
-      - [_lane_s16, int16x4_t, int16x4_t, '2', '[N as u32, N as u32, N as u32, N as u32]']
-      - [q_lane_s16, int16x4_t, int16x8_t, '2', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]']
-      - [_lane_u16, uint16x4_t, uint16x4_t, '2', '[N as u32, N as u32, N as u32, N as u32]']
-      - [q_lane_u16, uint16x4_t, uint16x8_t, '2', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]']
-      - [_lane_p16, poly16x4_t, poly16x4_t, '2', '[N as u32, N as u32, N as u32, N as u32]']
-      - [q_lane_p16, poly16x4_t, poly16x8_t, '2', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]']
+      - [_lane_s16, int16x4_t, int16x4_t, '2', '[N as u32; 4]']
+      - [q_lane_s16, int16x4_t, int16x8_t, '2', '[N as u32; 8]']
+      - [_lane_u16, uint16x4_t, uint16x4_t, '2', '[N as u32; 4]']
+      - [q_lane_u16, uint16x4_t, uint16x8_t, '2', '[N as u32; 8]']
+      - [_lane_p16, poly16x4_t, poly16x4_t, '2', '[N as u32; 4]']
+      - [q_lane_p16, poly16x4_t, poly16x8_t, '2', '[N as u32; 8]']
     compose:
       - FnCall: [static_assert_uimm_bits!, [N, "{type[3]}"]]
       - FnCall: [simd_shuffle!, [a, a, "{type[4]}"]]
@@ -1511,12 +1511,12 @@ intrinsics:
     static_defs: ['const N: i32']
     safety: safe
     types:
-      - [q_laneq_s16, int16x8_t, int16x8_t, '3', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]']
-      - [_laneq_s16, int16x8_t, int16x4_t, '3', '[N as u32, N as u32, N as u32, N as u32]']
-      - [q_laneq_u16, uint16x8_t, uint16x8_t, '3', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]']
-      - [_laneq_u16, uint16x8_t, uint16x4_t, '3', '[N as u32, N as u32, N as u32, N as u32]']
-      - [q_laneq_p16, poly16x8_t, poly16x8_t, '3', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]']
-      - [_laneq_p16, poly16x8_t, poly16x4_t, '3', '[N as u32, N as u32, N as u32, N as u32]']
+      - [q_laneq_s16, int16x8_t, int16x8_t, '3', '[N as u32; 8]']
+      - [_laneq_s16, int16x8_t, int16x4_t, '3', '[N as u32; 4]']
+      - [q_laneq_u16, uint16x8_t, uint16x8_t, '3', '[N as u32; 8]']
+      - [_laneq_u16, uint16x8_t, uint16x4_t, '3', '[N as u32; 4]']
+      - [q_laneq_p16, poly16x8_t, poly16x8_t, '3', '[N as u32; 8]']
+      - [_laneq_p16, poly16x8_t, poly16x4_t, '3', '[N as u32; 4]']
     compose:
       - FnCall: [static_assert_uimm_bits!, [N, "{type[3]}"]]
       - FnCall: [simd_shuffle!, [a, a, "{type[4]}"]]
@@ -1538,8 +1538,8 @@ intrinsics:
     static_defs: ['const N: i32']
     safety: safe
     types:
-      - [q_laneq_f16, float16x8_t, float16x8_t, '3', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]']
-      - [_laneq_f16, float16x8_t, float16x4_t, '3', '[N as u32, N as u32, N as u32, N as u32]']
+      - [q_laneq_f16, float16x8_t, float16x8_t, '3', '[N as u32; 8]']
+      - [_laneq_f16, float16x8_t, float16x4_t, '3', '[N as u32; 4]']
     compose:
       - FnCall: [static_assert_uimm_bits!, [N, "{type[3]}"]]
       - FnCall: [simd_shuffle!, [a, a, "{type[4]}"]]
@@ -1578,8 +1578,8 @@ intrinsics:
     static_defs: ['const N: i32']
     safety: safe
     types:
-      - [_lane_f16, float16x4_t, float16x4_t, '2', '[N as u32, N as u32, N as u32, N as u32]']
-      - [q_lane_f16, float16x4_t, float16x8_t, '2', '[N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32, N as u32]']
+      - [_lane_f16, float16x4_t, float16x4_t, '2', '[N as u32; 4]']
+      - [q_lane_f16, float16x4_t, float16x8_t, '2', '[N as u32; 8]']
     compose:
       - FnCall: [static_assert_uimm_bits!, [N, "{type[3]}"]]
       - FnCall: [simd_shuffle!, [a, a, "{type[4]}"]]
@@ -7675,7 +7675,7 @@ intrinsics:
     static_defs: ['const N: i32']
     safety: safe
     types:
-      - [int16x4_t, int16x4_t, int32x4_t, '[N as u32, N as u32, N as u32, N as u32]']
+      - [int16x4_t, int16x4_t, int32x4_t, '[N as u32; 4]']
     compose:
       - FnCall: [static_assert_uimm_bits!, [N, '2']]
       - Let: [b, "{neon_type[0]}", {FnCall: [simd_shuffle!, [b, b, "{type[3]}"]]}]
@@ -7695,7 +7695,7 @@ intrinsics:
     static_defs: ['const N: i32']
     safety: safe
     types:
-      - [int32x2_t, int32x2_t, int64x2_t, '[N as u32, N as u32]']
+      - [int32x2_t, int32x2_t, int64x2_t, '[N as u32; 2]']
     compose:
       - FnCall: [static_assert_uimm_bits!, [N, '1']]
       - Let: [b, "{neon_type[0]}", {FnCall: [simd_shuffle!, [b, b, "{type[3]}"]]}]
@@ -8320,9 +8320,9 @@ intrinsics:
     static_defs: ['const N: i32']
     safety: safe
     types:
-      - [uint16x8_t, uint8x8_t, 'N >= 1 && N <= 8', 'const { uint16x8_t([-N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16]) }']
-      - [uint32x4_t, uint16x4_t, 'N >= 1 && N <= 16', 'const { uint32x4_t([-N as u32, -N as u32, -N as u32, -N as u32]) }']
-      - [uint64x2_t, uint32x2_t, 'N >= 1 && N <= 32', 'const { uint64x2_t([-N as u64, -N as u64]) }']
+      - [uint16x8_t, uint8x8_t, 'N >= 1 && N <= 8', 'const { uint16x8_t([-N as u16; 8]) }']
+      - [uint32x4_t, uint16x4_t, 'N >= 1 && N <= 16', 'const { uint32x4_t([-N as u32; 4]) }']
+      - [uint64x2_t, uint32x2_t, 'N >= 1 && N <= 32', 'const { uint64x2_t([-N as u64; 2]) }']
     compose:
       - FnCall: [static_assert!, ["{type[2]}"]]
       - LLVMLink:
@@ -10789,9 +10789,9 @@ intrinsics:
     static_defs: ['const N: i32']
     safety: safe
     types:
-      - [uint16x8_t, uint8x8_t, '8', 'const { uint16x8_t([-N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16]) }']
-      - [uint32x4_t, uint16x4_t, '16', 'const { uint32x4_t([-N as u32, -N as u32, -N as u32, -N as u32]) }']
-      - [uint64x2_t, uint32x2_t, '32', 'const { uint64x2_t([-N as u64, -N as u64]) }']
+      - [uint16x8_t, uint8x8_t, '8', 'const { uint16x8_t([-N as u16; 8]) }']
+      - [uint32x4_t, uint16x4_t, '16', 'const { uint32x4_t([-N as u32; 4]) }']
+      - [uint64x2_t, uint32x2_t, '32', 'const { uint64x2_t([-N as u64; 2]) }']
     compose:
       - FnCall: [static_assert!, ['N >= 1 && N <= {type[2]}']]
       - LLVMLink:

From c0c9d025d5a0edcc34bf58645a74e76d4f6b5456 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Wed, 18 Feb 2026 21:23:00 +0100
Subject: [PATCH 02/19] use `intrinsics::simd` for interleaving store

---
 .../src/arm_shared/neon/generated.rs          | 192 +++---------------
 .../stdarch/crates/core_arch/src/macros.rs    |  67 ++++++
 .../spec/neon/arm_shared.spec.yml             |  86 +++-----
 3 files changed, 118 insertions(+), 227 deletions(-)

diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs
index 45c83b880e907..37c7ef8fea887 100644
--- a/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs
+++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs
@@ -66001,14 +66001,7 @@ pub unsafe fn vst2q_f16(a: *mut f16, b: float16x8x2_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(st2))]
 pub unsafe fn vst2_f32(a: *mut f32, b: float32x2x2_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st2.v2f32.p0"
-        )]
-        fn _vst2_f32(a: float32x2_t, b: float32x2_t, ptr: *mut i8);
-    }
-    _vst2_f32(b.0, b.1, a as _)
+    crate::core_arch::macros::interleaving_store!(f32, 2, 2, a, b)
 }
 #[doc = "Store multiple 2-element structures from two registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_f32)"]
@@ -66020,14 +66013,7 @@ pub unsafe fn vst2_f32(a: *mut f32, b: float32x2x2_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(st2))]
 pub unsafe fn vst2q_f32(a: *mut f32, b: float32x4x2_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st2.v4f32.p0"
-        )]
-        fn _vst2q_f32(a: float32x4_t, b: float32x4_t, ptr: *mut i8);
-    }
-    _vst2q_f32(b.0, b.1, a as _)
+    crate::core_arch::macros::interleaving_store!(f32, 4, 2, a, b)
 }
 #[doc = "Store multiple 2-element structures from two registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_s8)"]
@@ -66039,14 +66025,7 @@ pub unsafe fn vst2q_f32(a: *mut f32, b: float32x4x2_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(st2))]
 pub unsafe fn vst2_s8(a: *mut i8, b: int8x8x2_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st2.v8i8.p0"
-        )]
-        fn _vst2_s8(a: int8x8_t, b: int8x8_t, ptr: *mut i8);
-    }
-    _vst2_s8(b.0, b.1, a as _)
+    crate::core_arch::macros::interleaving_store!(i8, 8, 2, a, b)
 }
 #[doc = "Store multiple 2-element structures from two registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_s8)"]
@@ -66058,14 +66037,7 @@ pub unsafe fn vst2_s8(a: *mut i8, b: int8x8x2_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(st2))]
 pub unsafe fn vst2q_s8(a: *mut i8, b: int8x16x2_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st2.v16i8.p0"
-        )]
-        fn _vst2q_s8(a: int8x16_t, b: int8x16_t, ptr: *mut i8);
-    }
-    _vst2q_s8(b.0, b.1, a as _)
+    crate::core_arch::macros::interleaving_store!(i8, 16, 2, a, b)
 }
 #[doc = "Store multiple 2-element structures from two registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_s16)"]
@@ -66077,14 +66049,7 @@ pub unsafe fn vst2q_s8(a: *mut i8, b: int8x16x2_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(st2))]
 pub unsafe fn vst2_s16(a: *mut i16, b: int16x4x2_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st2.v4i16.p0"
-        )]
-        fn _vst2_s16(a: int16x4_t, b: int16x4_t, ptr: *mut i8);
-    }
-    _vst2_s16(b.0, b.1, a as _)
+    crate::core_arch::macros::interleaving_store!(i16, 4, 2, a, b)
 }
 #[doc = "Store multiple 2-element structures from two registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_s16)"]
@@ -66096,14 +66061,7 @@ pub unsafe fn vst2_s16(a: *mut i16, b: int16x4x2_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(st2))]
 pub unsafe fn vst2q_s16(a: *mut i16, b: int16x8x2_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st2.v8i16.p0"
-        )]
-        fn _vst2q_s16(a: int16x8_t, b: int16x8_t, ptr: *mut i8);
-    }
-    _vst2q_s16(b.0, b.1, a as _)
+    crate::core_arch::macros::interleaving_store!(i16, 8, 2, a, b)
 }
 #[doc = "Store multiple 2-element structures from two registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_s32)"]
@@ -66115,14 +66073,7 @@ pub unsafe fn vst2q_s16(a: *mut i16, b: int16x8x2_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(st2))]
 pub unsafe fn vst2_s32(a: *mut i32, b: int32x2x2_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st2.v2i32.p0"
-        )]
-        fn _vst2_s32(a: int32x2_t, b: int32x2_t, ptr: *mut i8);
-    }
-    _vst2_s32(b.0, b.1, a as _)
+    crate::core_arch::macros::interleaving_store!(i32, 2, 2, a, b)
 }
 #[doc = "Store multiple 2-element structures from two registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_s32)"]
@@ -66134,14 +66085,7 @@ pub unsafe fn vst2_s32(a: *mut i32, b: int32x2x2_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(st2))]
 pub unsafe fn vst2q_s32(a: *mut i32, b: int32x4x2_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st2.v4i32.p0"
-        )]
-        fn _vst2q_s32(a: int32x4_t, b: int32x4_t, ptr: *mut i8);
-    }
-    _vst2q_s32(b.0, b.1, a as _)
+    crate::core_arch::macros::interleaving_store!(i32, 4, 2, a, b)
 }
 #[doc = "Store multiple 2-element structures from two registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_f32)"]
@@ -67233,11 +67177,7 @@ pub unsafe fn vst3q_f16(a: *mut f16, b: float16x8x3_t) {
 #[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")]
 #[cfg_attr(test, assert_instr(vst3))]
 pub unsafe fn vst3_f32(a: *mut f32, b: float32x2x3_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0.v2f32")]
-        fn _vst3_f32(ptr: *mut i8, a: float32x2_t, b: float32x2_t, c: float32x2_t, size: i32);
-    }
-    _vst3_f32(a as _, b.0, b.1, b.2, 4)
+    crate::core_arch::macros::interleaving_store!(f32, 2, 3, a, b)
 }
 #[doc = "Store multiple 3-element structures from three registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_f32)"]
@@ -67249,11 +67189,7 @@ pub unsafe fn vst3_f32(a: *mut f32, b: float32x2x3_t) {
 #[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")]
 #[cfg_attr(test, assert_instr(vst3))]
 pub unsafe fn vst3q_f32(a: *mut f32, b: float32x4x3_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0.v4f32")]
-        fn _vst3q_f32(ptr: *mut i8, a: float32x4_t, b: float32x4_t, c: float32x4_t, size: i32);
-    }
-    _vst3q_f32(a as _, b.0, b.1, b.2, 4)
+    crate::core_arch::macros::interleaving_store!(f32, 4, 3, a, b)
 }
 #[doc = "Store multiple 3-element structures from three registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_s8)"]
@@ -67265,11 +67201,7 @@ pub unsafe fn vst3q_f32(a: *mut f32, b: float32x4x3_t) {
 #[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")]
 #[cfg_attr(test, assert_instr(vst3))]
 pub unsafe fn vst3_s8(a: *mut i8, b: int8x8x3_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0.v8i8")]
-        fn _vst3_s8(ptr: *mut i8, a: int8x8_t, b: int8x8_t, c: int8x8_t, size: i32);
-    }
-    _vst3_s8(a as _, b.0, b.1, b.2, 1)
+    crate::core_arch::macros::interleaving_store!(i8, 8, 3, a, b)
 }
 #[doc = "Store multiple 3-element structures from three registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_s8)"]
@@ -67281,11 +67213,7 @@ pub unsafe fn vst3_s8(a: *mut i8, b: int8x8x3_t) {
 #[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")]
 #[cfg_attr(test, assert_instr(vst3))]
 pub unsafe fn vst3q_s8(a: *mut i8, b: int8x16x3_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0.v16i8")]
-        fn _vst3q_s8(ptr: *mut i8, a: int8x16_t, b: int8x16_t, c: int8x16_t, size: i32);
-    }
-    _vst3q_s8(a as _, b.0, b.1, b.2, 1)
+    crate::core_arch::macros::interleaving_store!(i8, 16, 3, a, b)
 }
 #[doc = "Store multiple 3-element structures from three registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_s16)"]
@@ -67297,11 +67225,7 @@ pub unsafe fn vst3q_s8(a: *mut i8, b: int8x16x3_t) {
 #[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")]
 #[cfg_attr(test, assert_instr(vst3))]
 pub unsafe fn vst3_s16(a: *mut i16, b: int16x4x3_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0.v4i16")]
-        fn _vst3_s16(ptr: *mut i8, a: int16x4_t, b: int16x4_t, c: int16x4_t, size: i32);
-    }
-    _vst3_s16(a as _, b.0, b.1, b.2, 2)
+    crate::core_arch::macros::interleaving_store!(i16, 4, 3, a, b)
 }
 #[doc = "Store multiple 3-element structures from three registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_s16)"]
@@ -67313,11 +67237,7 @@ pub unsafe fn vst3_s16(a: *mut i16, b: int16x4x3_t) {
 #[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")]
 #[cfg_attr(test, assert_instr(vst3))]
 pub unsafe fn vst3q_s16(a: *mut i16, b: int16x8x3_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0.v8i16")]
-        fn _vst3q_s16(ptr: *mut i8, a: int16x8_t, b: int16x8_t, c: int16x8_t, size: i32);
-    }
-    _vst3q_s16(a as _, b.0, b.1, b.2, 2)
+    crate::core_arch::macros::interleaving_store!(i16, 8, 3, a, b)
 }
 #[doc = "Store multiple 3-element structures from three registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_s32)"]
@@ -67329,11 +67249,7 @@ pub unsafe fn vst3q_s16(a: *mut i16, b: int16x8x3_t) {
 #[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")]
 #[cfg_attr(test, assert_instr(vst3))]
 pub unsafe fn vst3_s32(a: *mut i32, b: int32x2x3_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0.v2i32")]
-        fn _vst3_s32(ptr: *mut i8, a: int32x2_t, b: int32x2_t, c: int32x2_t, size: i32);
-    }
-    _vst3_s32(a as _, b.0, b.1, b.2, 4)
+    crate::core_arch::macros::interleaving_store!(i32, 2, 3, a, b)
 }
 #[doc = "Store multiple 3-element structures from three registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_s32)"]
@@ -67345,11 +67261,7 @@ pub unsafe fn vst3_s32(a: *mut i32, b: int32x2x3_t) {
 #[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")]
 #[cfg_attr(test, assert_instr(vst3))]
 pub unsafe fn vst3q_s32(a: *mut i32, b: int32x4x3_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0.v4i32")]
-        fn _vst3q_s32(ptr: *mut i8, a: int32x4_t, b: int32x4_t, c: int32x4_t, size: i32);
-    }
-    _vst3q_s32(a as _, b.0, b.1, b.2, 4)
+    crate::core_arch::macros::interleaving_store!(i32, 4, 3, a, b)
 }
 #[doc = "Store multiple 3-element structures from three registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_f32)"]
@@ -68712,14 +68624,7 @@ pub unsafe fn vst4q_s32(a: *mut i32, b: int32x4x4_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(st4))]
 pub unsafe fn vst4_f32(a: *mut f32, b: float32x2x4_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st4.v2f32.p0"
-        )]
-        fn _vst4_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t, d: float32x2_t, ptr: *mut i8);
-    }
-    _vst4_f32(b.0, b.1, b.2, b.3, a as _)
+    crate::core_arch::macros::interleaving_store!(f32, 2, 4, a, b)
 }
 #[doc = "Store multiple 4-element structures from four registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_f32)"]
@@ -68731,14 +68636,7 @@ pub unsafe fn vst4_f32(a: *mut f32, b: float32x2x4_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(st4))]
 pub unsafe fn vst4q_f32(a: *mut f32, b: float32x4x4_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st4.v4f32.p0"
-        )]
-        fn _vst4q_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t, d: float32x4_t, ptr: *mut i8);
-    }
-    _vst4q_f32(b.0, b.1, b.2, b.3, a as _)
+    crate::core_arch::macros::interleaving_store!(f32, 4, 4, a, b)
 }
 #[doc = "Store multiple 4-element structures from four registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_s8)"]
@@ -68750,14 +68648,7 @@ pub unsafe fn vst4q_f32(a: *mut f32, b: float32x4x4_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(st4))]
 pub unsafe fn vst4_s8(a: *mut i8, b: int8x8x4_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st4.v8i8.p0"
-        )]
-        fn _vst4_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, ptr: *mut i8);
-    }
-    _vst4_s8(b.0, b.1, b.2, b.3, a as _)
+    crate::core_arch::macros::interleaving_store!(i8, 8, 4, a, b)
 }
 #[doc = "Store multiple 4-element structures from four registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_s8)"]
@@ -68769,14 +68660,7 @@ pub unsafe fn vst4_s8(a: *mut i8, b: int8x8x4_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(st4))]
 pub unsafe fn vst4q_s8(a: *mut i8, b: int8x16x4_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st4.v16i8.p0"
-        )]
-        fn _vst4q_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t, d: int8x16_t, ptr: *mut i8);
-    }
-    _vst4q_s8(b.0, b.1, b.2, b.3, a as _)
+    crate::core_arch::macros::interleaving_store!(i8, 16, 4, a, b)
 }
 #[doc = "Store multiple 4-element structures from four registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_s16)"]
@@ -68788,14 +68672,7 @@ pub unsafe fn vst4q_s8(a: *mut i8, b: int8x16x4_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(st4))]
 pub unsafe fn vst4_s16(a: *mut i16, b: int16x4x4_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st4.v4i16.p0"
-        )]
-        fn _vst4_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t, d: int16x4_t, ptr: *mut i8);
-    }
-    _vst4_s16(b.0, b.1, b.2, b.3, a as _)
+    crate::core_arch::macros::interleaving_store!(i16, 4, 4, a, b)
 }
 #[doc = "Store multiple 4-element structures from four registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_s16)"]
@@ -68807,14 +68684,7 @@ pub unsafe fn vst4_s16(a: *mut i16, b: int16x4x4_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(st4))]
 pub unsafe fn vst4q_s16(a: *mut i16, b: int16x8x4_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st4.v8i16.p0"
-        )]
-        fn _vst4q_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t, d: int16x8_t, ptr: *mut i8);
-    }
-    _vst4q_s16(b.0, b.1, b.2, b.3, a as _)
+    crate::core_arch::macros::interleaving_store!(i16, 8, 4, a, b)
 }
 #[doc = "Store multiple 4-element structures from four registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_s32)"]
@@ -68826,14 +68696,7 @@ pub unsafe fn vst4q_s16(a: *mut i16, b: int16x8x4_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(st4))]
 pub unsafe fn vst4_s32(a: *mut i32, b: int32x2x4_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st4.v2i32.p0"
-        )]
-        fn _vst4_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t, d: int32x2_t, ptr: *mut i8);
-    }
-    _vst4_s32(b.0, b.1, b.2, b.3, a as _)
+    crate::core_arch::macros::interleaving_store!(i32, 2, 4, a, b)
 }
 #[doc = "Store multiple 4-element structures from four registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_s32)"]
@@ -68845,14 +68708,7 @@ pub unsafe fn vst4_s32(a: *mut i32, b: int32x2x4_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(st4))]
 pub unsafe fn vst4q_s32(a: *mut i32, b: int32x4x4_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st4.v4i32.p0"
-        )]
-        fn _vst4q_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t, d: int32x4_t, ptr: *mut i8);
-    }
-    _vst4q_s32(b.0, b.1, b.2, b.3, a as _)
+    crate::core_arch::macros::interleaving_store!(i32, 4, 4, a, b)
 }
 #[doc = "Store multiple 4-element structures from four registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_lane_f16)"]
diff --git a/library/stdarch/crates/core_arch/src/macros.rs b/library/stdarch/crates/core_arch/src/macros.rs
index 9f6922efeeb7d..5a582fe17772b 100644
--- a/library/stdarch/crates/core_arch/src/macros.rs
+++ b/library/stdarch/crates/core_arch/src/macros.rs
@@ -187,6 +187,17 @@ macro_rules! simd_masked_store {
     };
 }
 
+/// The first N indices `[0, 1, 2, ...]`.
+pub(crate) const fn identity<const N: usize>() -> [u32; N] {
+    let mut out = [0u32; N];
+    let mut i = 0usize;
+    while i < N {
+        out[i] = i as u32;
+        i += 1;
+    }
+    out
+}
+
 /// The first N even indices `[0, 2, 4, ...]`.
 pub(crate) const fn even<const N: usize>() -> [u32; N] {
     let mut out = [0u32; N];
@@ -277,3 +288,59 @@ macro_rules! deinterleaving_load {
 
 #[allow(unused)]
 pub(crate) use deinterleaving_load;
+
+pub(crate) const fn interleave_mask<const LANES: usize, const N: usize, const K: usize>()
+-> [u32; LANES] {
+    let mut out = [0u32; LANES];
+    let mut j = 0usize;
+    while j < LANES {
+        out[j] = ((j % K) * N + j / K) as u32;
+        j += 1;
+    }
+    out
+}
+
+#[allow(unused)]
+macro_rules! interleaving_store {
+    ($elem:ty, $lanes:literal, 2, $ptr:expr, $v:expr) => {{
+        use $crate::core_arch::macros::interleave_mask;
+        use $crate::core_arch::simd::Simd;
+
+        type W = Simd<$elem, { $lanes * 2 }>;
+        let w: W = simd_shuffle!($v.0, $v.1, interleave_mask::<{ $lanes * 2 }, $lanes, 2>());
+        $crate::ptr::write_unaligned($ptr as *mut W, w);
+    }};
+
+    // N = 3
+    ($elem:ty, $lanes:literal, 3, $ptr:expr, $v:expr) => {{
+        use $crate::core_arch::macros::{identity, interleave_mask};
+        use $crate::core_arch::simd::Simd;
+
+        let v0v1: Simd<$elem, { $lanes * 2 }> =
+            simd_shuffle!($v.0, $v.1, identity::<{ $lanes * 2 }>());
+        let v2v2: Simd<$elem, { $lanes * 2 }> =
+            simd_shuffle!($v.2, $v.2, identity::<{ $lanes * 2 }>());
+
+        type W = Simd<$elem, { $lanes * 3 }>;
+        let w: W = simd_shuffle!(v0v1, v2v2, interleave_mask::<{ $lanes * 3 }, $lanes, 3>());
+        $crate::ptr::write_unaligned($ptr as *mut W, w);
+    }};
+
+    // N = 4
+    ($elem:ty, $lanes:literal, 4, $ptr:expr, $v:expr) => {{
+        use $crate::core_arch::macros::{identity, interleave_mask};
+        use $crate::core_arch::simd::Simd;
+
+        let v0v1: Simd<$elem, { $lanes * 2 }> =
+            simd_shuffle!($v.0, $v.1, identity::<{ $lanes * 2 }>());
+        let v2v3: Simd<$elem, { $lanes * 2 }> =
+            simd_shuffle!($v.2, $v.3, identity::<{ $lanes * 2 }>());
+
+        type W = Simd<$elem, { $lanes * 4 }>;
+        let w: W = simd_shuffle!(v0v1, v2v3, interleave_mask::<{ $lanes * 4 }, $lanes, 4>());
+        $crate::ptr::write_unaligned($ptr as *mut W, w);
+    }};
+}
+
+#[allow(unused)]
+pub(crate) use interleaving_store;
diff --git a/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml b/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml
index 8e10fff984ac7..f890b39f071d2 100644
--- a/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml
+++ b/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml
@@ -5113,26 +5113,16 @@ intrinsics:
     safety:
       unsafe: [neon]
     types:
-      - [i8, int8x8x2_t, int8x8_t]
-      - [i16, int16x4x2_t, int16x4_t]
-      - [i32, int32x2x2_t, int32x2_t]
-      - [i8, int8x16x2_t, int8x16_t]
-      - [i16, int16x8x2_t, int16x8_t]
-      - [i32, int32x4x2_t, int32x4_t]
-      - [f32, float32x2x2_t, float32x2_t]
-      - [f32, float32x4x2_t, float32x4_t]
+      - [i8, int8x8x2_t, "8"]
+      - [i16, int16x4x2_t, "4"]
+      - [i32, int32x2x2_t, "2"]
+      - [i8, int8x16x2_t, "16"]
+      - [i16, int16x8x2_t, "8"]
+      - [i32, int32x4x2_t, "4"]
+      - [f32, float32x2x2_t, "2"]
+      - [f32, float32x4x2_t, "4"]
     compose:
-      - LLVMLink:
-          name: 'st2.{neon_type[1]}'
-          arguments:
-            - 'a: {type[2]}'
-            - 'b: {type[2]}'
-            - 'ptr: *mut i8'
-          links:
-            - link: 'llvm.aarch64.neon.st2.v{neon_type[1].lane}{type[0]}.p0'
-              arch: aarch64,arm64ec
-      - FnCall: ['_vst2{neon_type[1].nox}', ['b.0', 'b.1', 'a as _']]
-
+      - FnCall: ["crate::core_arch::macros::interleaving_store!", [{ Type: "{type[0]}" }, "{type[2]}", "2", a, b], [], true]
 
   - name: "vst2{neon_type[1].nox}"
     doc: "Store multiple 2-element structures from two registers"
@@ -5571,27 +5561,16 @@ intrinsics:
     safety:
       unsafe: [neon]
     types:
-      - [i8, int8x8x3_t, int8x8_t, '1']
-      - [i16, int16x4x3_t, int16x4_t, '2']
-      - [i32, int32x2x3_t, int32x2_t, '4']
-      - [i8, int8x16x3_t, int8x16_t, '1']
-      - [i16, int16x8x3_t, int16x8_t, '2']
-      - [i32, int32x4x3_t, int32x4_t, '4']
-      - [f32, float32x2x3_t, float32x2_t, '4']
-      - [f32, float32x4x3_t, float32x4_t, '4']
+      - [i8, int8x8x3_t, '8']
+      - [i16, int16x4x3_t, '4']
+      - [i32, int32x2x3_t, '2']
+      - [i8, int8x16x3_t, '16']
+      - [i16, int16x8x3_t, '8']
+      - [i32, int32x4x3_t, '4']
+      - [f32, float32x2x3_t, '2']
+      - [f32, float32x4x3_t, '4']
     compose:
-      - LLVMLink:
-          name: 'vst3.{neon_type[1]}'
-          arguments:
-            - 'ptr: *mut i8'
-            - 'a: {type[2]}'
-            - 'b: {type[2]}'
-            - 'c: {type[2]}'
-            - 'size: i32'
-          links:
-            - link: 'llvm.arm.neon.vst3.p0.v{neon_type[1].lane}{type[0]}'
-              arch: arm
-      - FnCall: ['_vst3{neon_type[1].nox}', ['a as _', 'b.0', 'b.1', 'b.2', "{type[3]}"]]
+      - FnCall: ["crate::core_arch::macros::interleaving_store!", [{ Type: "{type[0]}" }, "{type[2]}", "3", a, b], [], true]
 
 
   - name: "vst3{neon_type[1].nox}"
@@ -6114,27 +6093,16 @@ intrinsics:
     safety:
       unsafe: [neon]
     types:
-      - [i8, int8x8x4_t, int8x8_t]
-      - [i16, int16x4x4_t, int16x4_t]
-      - [i32, int32x2x4_t, int32x2_t]
-      - [i8, int8x16x4_t, int8x16_t]
-      - [i16, int16x8x4_t, int16x8_t]
-      - [i32, int32x4x4_t, int32x4_t]
-      - [f32, float32x2x4_t, float32x2_t]
-      - [f32, float32x4x4_t, float32x4_t]
+      - [i8, int8x8x4_t, "8"]
+      - [i16, int16x4x4_t, "4"]
+      - [i32, int32x2x4_t, "2"]
+      - [i8, int8x16x4_t, "16"]
+      - [i16, int16x8x4_t, "8"]
+      - [i32, int32x4x4_t, "4"]
+      - [f32, float32x2x4_t, "2"]
+      - [f32, float32x4x4_t, "4"]
     compose:
-      - LLVMLink:
-          name: 'vst4.{neon_type[1]}'
-          arguments:
-            - 'a: {type[2]}'
-            - 'b: {type[2]}'
-            - 'c: {type[2]}'
-            - 'd: {type[2]}'
-            - 'ptr: *mut i8'
-          links:
-            - link: 'llvm.aarch64.neon.st4.v{neon_type[1].lane}{type[0]}.p0'
-              arch: aarch64,arm64ec
-      - FnCall: ['_vst4{neon_type[1].nox}', ['b.0', 'b.1', 'b.2', 'b.3', 'a as _']]
+      - FnCall: ["crate::core_arch::macros::interleaving_store!", [{ Type: "{type[0]}" }, "{type[2]}", "4", a, b], [], true]
 
 
   - name: "vst4{neon_type[1].nox}"

From e219383c8f00a56d61e3101ac4feca383c4ecaca Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Wed, 18 Feb 2026 21:40:50 +0100
Subject: [PATCH 03/19] use `intrinsics::simd` for interleaving store of
 `int64x1`

---
 .../src/arm_shared/neon/generated.rs          | 52 ++-----------
 .../spec/neon/arm_shared.spec.yml             | 75 ++-----------------
 2 files changed, 12 insertions(+), 115 deletions(-)

diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs
index 37c7ef8fea887..62201edfdaf25 100644
--- a/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs
+++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs
@@ -66809,11 +66809,7 @@ pub unsafe fn vst2_p64(a: *mut p64, b: poly64x1x2_t) {
 #[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")]
 #[cfg_attr(test, assert_instr(nop))]
 pub unsafe fn vst2_s64(a: *mut i64, b: int64x1x2_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.v1i64.p0")]
-        fn _vst2_s64(ptr: *mut i8, a: int64x1_t, b: int64x1_t, size: i32);
-    }
-    _vst2_s64(a as _, b.0, b.1, 8)
+    core::ptr::write_unaligned(a.cast(), b)
 }
 #[doc = "Store multiple 2-element structures from two registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_s64)"]
@@ -66825,14 +66821,7 @@ pub unsafe fn vst2_s64(a: *mut i64, b: int64x1x2_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(nop))]
 pub unsafe fn vst2_s64(a: *mut i64, b: int64x1x2_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st2.v1i64.p0"
-        )]
-        fn _vst2_s64(a: int64x1_t, b: int64x1_t, ptr: *mut i8);
-    }
-    _vst2_s64(b.0, b.1, a as _)
+    core::ptr::write_unaligned(a.cast(), b)
 }
 #[doc = "Store multiple 2-element structures from two registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_u64)"]
@@ -68065,14 +68054,7 @@ pub unsafe fn vst3_p64(a: *mut p64, b: poly64x1x3_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(nop))]
 pub unsafe fn vst3_s64(a: *mut i64, b: int64x1x3_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st3.v1i64.p0"
-        )]
-        fn _vst3_s64(a: int64x1_t, b: int64x1_t, c: int64x1_t, ptr: *mut i8);
-    }
-    _vst3_s64(b.0, b.1, b.2, a as _)
+    core::ptr::write_unaligned(a.cast(), b)
 }
 #[doc = "Store multiple 3-element structures from three registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_s64)"]
@@ -68084,11 +68066,7 @@ pub unsafe fn vst3_s64(a: *mut i64, b: int64x1x3_t) {
 #[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")]
 #[cfg_attr(test, assert_instr(nop))]
 pub unsafe fn vst3_s64(a: *mut i64, b: int64x1x3_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0.v1i64")]
-        fn _vst3_s64(ptr: *mut i8, a: int64x1_t, b: int64x1_t, c: int64x1_t, size: i32);
-    }
-    _vst3_s64(a as _, b.0, b.1, b.2, 8)
+    core::ptr::write_unaligned(a.cast(), b)
 }
 #[doc = "Store multiple 3-element structures from three registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_u64)"]
@@ -69432,18 +69410,7 @@ pub unsafe fn vst4_p64(a: *mut p64, b: poly64x1x4_t) {
 #[unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")]
 #[cfg_attr(test, assert_instr(nop))]
 pub unsafe fn vst4_s64(a: *mut i64, b: int64x1x4_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0.v1i64")]
-        fn _vst4_s64(
-            ptr: *mut i8,
-            a: int64x1_t,
-            b: int64x1_t,
-            c: int64x1_t,
-            d: int64x1_t,
-            size: i32,
-        );
-    }
-    _vst4_s64(a as _, b.0, b.1, b.2, b.3, 8)
+    core::ptr::write_unaligned(a.cast(), b)
 }
 #[doc = "Store multiple 4-element structures from four registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_s64)"]
@@ -69455,14 +69422,7 @@ pub unsafe fn vst4_s64(a: *mut i64, b: int64x1x4_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(nop))]
 pub unsafe fn vst4_s64(a: *mut i64, b: int64x1x4_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st4.v1i64.p0"
-        )]
-        fn _vst4_s64(a: int64x1_t, b: int64x1_t, c: int64x1_t, d: int64x1_t, ptr: *mut i8);
-    }
-    _vst4_s64(b.0, b.1, b.2, b.3, a as _)
+    core::ptr::write_unaligned(a.cast(), b)
 }
 #[doc = "Store multiple 4-element structures from four registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_u64)"]
diff --git a/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml b/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml
index f890b39f071d2..23145d6d6692e 100644
--- a/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml
+++ b/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml
@@ -5049,17 +5049,7 @@ intrinsics:
     types:
       - [i64, int64x1x2_t, int64x1_t]
     compose:
-      - LLVMLink:
-          name: 'vst2.{neon_type[1]}'
-          arguments:
-            - 'ptr: *mut i8'
-            - 'a: {type[2]}'
-            - 'b: {type[2]}'
-            - 'size: i32'
-          links:
-            - link: 'llvm.arm.neon.vst2.v{neon_type[1].lane}{type[0]}.p0'
-              arch: arm
-      - FnCall: ['_vst2{neon_type[1].nox}', ['a as _', 'b.0', 'b.1', '8']]
+      - FnCall: [core::ptr::write_unaligned, ['a.cast()', b]]
 
   - name: "vst2{neon_type[1].nox}"
     doc: "Store multiple 2-element structures from two registers"
@@ -5092,16 +5082,7 @@ intrinsics:
     types:
       - [i64, int64x1x2_t, int64x1_t]
     compose:
-      - LLVMLink:
-          name: 'st2.{neon_type[1]}'
-          arguments:
-            - 'a: {type[2]}'
-            - 'b: {type[2]}'
-            - 'ptr: *mut i8'
-          links:
-            - link: 'llvm.aarch64.neon.st2.v{neon_type[1].lane}{type[0]}.p0'
-              arch: aarch64,arm64ec
-      - FnCall: ['_vst2{neon_type[1].nox}', ['b.0', 'b.1', 'a as _']]
+      - FnCall: [core::ptr::write_unaligned, ['a.cast()', b]]
 
   - name: "vst2{neon_type[1].nox}"
     doc: "Store multiple 2-element structures from two registers"
@@ -5416,17 +5397,7 @@ intrinsics:
     types:
       - [i64, int64x1x3_t, int64x1_t]
     compose:
-      - LLVMLink:
-          name: 'st3.{neon_type[1].nox}'
-          arguments:
-            - 'a: {type[2]}'
-            - 'b: {type[2]}'
-            - 'c: {type[2]}'
-            - 'ptr: *mut i8'
-          links:
-            - link: 'llvm.aarch64.neon.st3.v{neon_type[1].lane}{type[0]}.p0'
-              arch: aarch64,arm64ec
-      - FnCall: ['_vst3{neon_type[1].nox}', ['b.0', 'b.1', 'b.2', 'a as _']]
+      - FnCall: [core::ptr::write_unaligned, ['a.cast()', b]]
 
   - name: "vst3{neon_type[1].nox}"
     doc: "Store multiple 3-element structures from three registers"
@@ -5461,18 +5432,7 @@ intrinsics:
     types:
       - [i64, int64x1x3_t, int64x1_t]
     compose:
-      - LLVMLink:
-          name: 'vst3.{neon_type[1]}'
-          arguments:
-            - 'ptr: *mut i8'
-            - 'a: {type[2]}'
-            - 'b: {type[2]}'
-            - 'c: {type[2]}'
-            - 'size: i32'
-          links:
-            - link: 'llvm.arm.neon.vst3.p0.v{neon_type[1].lane}{type[0]}'
-              arch: arm
-      - FnCall: ['_vst3{neon_type[1].nox}', ['a as _', 'b.0', 'b.1', 'b.2', '8']]
+      - FnCall: [core::ptr::write_unaligned, ['a.cast()', b]]
 
   - name: "vst3{neon_type[1].nox}"
     doc: "Store multiple 3-element structures from three registers"
@@ -5832,19 +5792,7 @@ intrinsics:
     types:
       - [i64, int64x1x4_t, int64x1_t]
     compose:
-      - LLVMLink:
-          name: 'vst4.{neon_type[1]}'
-          arguments:
-            - 'ptr: *mut i8'
-            - 'a: {type[2]}'
-            - 'b: {type[2]}'
-            - 'c: {type[2]}'
-            - 'd: {type[2]}'
-            - 'size: i32'
-          links:
-            - link: 'llvm.arm.neon.vst4.p0.v{neon_type[1].lane}{type[0]}'
-              arch: arm
-      - FnCall: ['_vst4{neon_type[1].nox}', ['a as _', 'b.0', 'b.1', 'b.2', 'b.3', '8']]
+      - FnCall: [core::ptr::write_unaligned, ['a.cast()', b]]
 
   - name: "vst4{neon_type[1].nox}"
     doc: "Store multiple 4-element structures from four registers"
@@ -5858,18 +5806,7 @@ intrinsics:
     types:
       - [i64, int64x1x4_t, int64x1_t]
     compose:
-      - LLVMLink:
-          name: 'vst4.{neon_type[1]}'
-          arguments:
-            - 'a: {type[2]}'
-            - 'b: {type[2]}'
-            - 'c: {type[2]}'
-            - 'd: {type[2]}'
-            - 'ptr: *mut i8'
-          links:
-            - link: 'llvm.aarch64.neon.st4.{neon_type[2]}.p0'
-              arch: aarch64,arm64ec
-      - FnCall: ['_vst4{neon_type[1].nox}', ['b.0', 'b.1', 'b.2', 'b.3', 'a as _']]
+      - FnCall: [core::ptr::write_unaligned, ['a.cast()', b]]
 
   - name: "vst4{neon_type[1].nox}"
     doc: "Store multiple 4-element structures from four registers"

From afb890e048205ba28b090c6607721dee2cb636bc Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Wed, 18 Feb 2026 21:51:54 +0100
Subject: [PATCH 04/19] use `intrinsics::simd` for interleaving store of f16

---
 library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs b/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs
index 2df4ba7443314..2fbd2255aa0fd 100644
--- a/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs
+++ b/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs
@@ -1050,6 +1050,14 @@ mod tests {
         test_vld1q_f16_x2(f16, 16, float16x8x2_t, vst1q_f16_x2, vld1q_f16_x2);
         test_vld1q_f16_x3(f16, 24, float16x8x3_t, vst1q_f16_x3, vld1q_f16_x3);
         test_vld1q_f16_x4(f16, 32, float16x8x4_t, vst1q_f16_x4, vld1q_f16_x4);
+
+        test_vld2_f16_x2(f16, 8, float16x4x2_t, vst2_f16, vld2_f16);
+        test_vld2_f16_x3(f16, 12, float16x4x3_t, vst3_f16, vld3_f16);
+        test_vld2_f16_x4(f16, 16, float16x4x4_t, vst4_f16, vld4_f16);
+
+        test_vld2q_f16_x2(f16, 16, float16x8x2_t, vst2q_f16, vld2q_f16);
+        test_vld3q_f16_x3(f16, 24, float16x8x3_t, vst3q_f16, vld3q_f16);
+        test_vld4q_f16_x4(f16, 32, float16x8x4_t, vst4q_f16, vld4q_f16);
     }
 
     macro_rules! wide_store_load_roundtrip_aes {

From 2b656f99a5cd12722831a1a92365a8000b61602a Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Wed, 18 Feb 2026 22:27:23 +0100
Subject: [PATCH 05/19] use `intrinsics::simd` for aarch64 interleaving `st`

---
 .../core_arch/src/aarch64/neon/generated.rs   | 83 ++---------------
 .../spec/neon/aarch64.spec.yml                | 92 ++++---------------
 2 files changed, 26 insertions(+), 149 deletions(-)

diff --git a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs
index 88afaae8b80d3..41f01d445fc71 100644
--- a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs
+++ b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs
@@ -25039,16 +25039,9 @@ pub unsafe fn vst1q_lane_f64<const LANE: i32>(a: *mut f64, b: float64x2_t) {
 #[inline(always)]
 #[target_feature(enable = "neon")]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
-#[cfg_attr(test, assert_instr(st1))]
+#[cfg_attr(test, assert_instr(stp))]
 pub unsafe fn vst2_f64(a: *mut f64, b: float64x1x2_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st2.v1f64.p0"
-        )]
-        fn _vst2_f64(a: float64x1_t, b: float64x1_t, ptr: *mut i8);
-    }
-    _vst2_f64(b.0, b.1, a as _)
+    core::ptr::write_unaligned(a.cast(), b)
 }
 #[doc = "Store multiple 2-element structures from two registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_lane_f64)"]
@@ -25125,14 +25118,7 @@ pub unsafe fn vst2_lane_u64<const LANE: i32>(a: *mut u64, b: uint64x1x2_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(st2))]
 pub unsafe fn vst2q_f64(a: *mut f64, b: float64x2x2_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st2.v2f64.p0"
-        )]
-        fn _vst2q_f64(a: float64x2_t, b: float64x2_t, ptr: *mut i8);
-    }
-    _vst2q_f64(b.0, b.1, a as _)
+    crate::core_arch::macros::interleaving_store!(f64, 2, 2, a, b)
 }
 #[doc = "Store multiple 2-element structures from two registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_s64)"]
@@ -25143,14 +25129,7 @@ pub unsafe fn vst2q_f64(a: *mut f64, b: float64x2x2_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(st2))]
 pub unsafe fn vst2q_s64(a: *mut i64, b: int64x2x2_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st2.v2i64.p0"
-        )]
-        fn _vst2q_s64(a: int64x2_t, b: int64x2_t, ptr: *mut i8);
-    }
-    _vst2q_s64(b.0, b.1, a as _)
+    crate::core_arch::macros::interleaving_store!(i64, 2, 2, a, b)
 }
 #[doc = "Store multiple 2-element structures from two registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_lane_f64)"]
@@ -25295,14 +25274,7 @@ pub unsafe fn vst2q_u64(a: *mut u64, b: uint64x2x2_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(nop))]
 pub unsafe fn vst3_f64(a: *mut f64, b: float64x1x3_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st3.v1f64.p0"
-        )]
-        fn _vst3_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t, ptr: *mut i8);
-    }
-    _vst3_f64(b.0, b.1, b.2, a as _)
+    core::ptr::write_unaligned(a.cast(), b)
 }
 #[doc = "Store multiple 3-element structures from three registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_lane_f64)"]
@@ -25379,14 +25351,7 @@ pub unsafe fn vst3_lane_u64<const LANE: i32>(a: *mut u64, b: uint64x1x3_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(st3))]
 pub unsafe fn vst3q_f64(a: *mut f64, b: float64x2x3_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st3.v2f64.p0"
-        )]
-        fn _vst3q_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t, ptr: *mut i8);
-    }
-    _vst3q_f64(b.0, b.1, b.2, a as _)
+    crate::core_arch::macros::interleaving_store!(f64, 2, 3, a, b)
 }
 #[doc = "Store multiple 3-element structures from three registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_s64)"]
@@ -25397,14 +25362,7 @@ pub unsafe fn vst3q_f64(a: *mut f64, b: float64x2x3_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(st3))]
 pub unsafe fn vst3q_s64(a: *mut i64, b: int64x2x3_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st3.v2i64.p0"
-        )]
-        fn _vst3q_s64(a: int64x2_t, b: int64x2_t, c: int64x2_t, ptr: *mut i8);
-    }
-    _vst3q_s64(b.0, b.1, b.2, a as _)
+    crate::core_arch::macros::interleaving_store!(i64, 2, 3, a, b)
 }
 #[doc = "Store multiple 3-element structures from three registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_lane_f64)"]
@@ -25549,14 +25507,7 @@ pub unsafe fn vst3q_u64(a: *mut u64, b: uint64x2x3_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(nop))]
 pub unsafe fn vst4_f64(a: *mut f64, b: float64x1x4_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st4.v1f64.p0"
-        )]
-        fn _vst4_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t, d: float64x1_t, ptr: *mut i8);
-    }
-    _vst4_f64(b.0, b.1, b.2, b.3, a as _)
+    core::ptr::write_unaligned(a.cast(), b)
 }
 #[doc = "Store multiple 4-element structures from four registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_lane_f64)"]
@@ -25647,14 +25598,7 @@ pub unsafe fn vst4_lane_u64<const LANE: i32>(a: *mut u64, b: uint64x1x4_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(st4))]
 pub unsafe fn vst4q_f64(a: *mut f64, b: float64x2x4_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st4.v2f64.p0"
-        )]
-        fn _vst4q_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t, d: float64x2_t, ptr: *mut i8);
-    }
-    _vst4q_f64(b.0, b.1, b.2, b.3, a as _)
+    crate::core_arch::macros::interleaving_store!(f64, 2, 4, a, b)
 }
 #[doc = "Store multiple 4-element structures from four registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_s64)"]
@@ -25665,14 +25609,7 @@ pub unsafe fn vst4q_f64(a: *mut f64, b: float64x2x4_t) {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(st4))]
 pub unsafe fn vst4q_s64(a: *mut i64, b: int64x2x4_t) {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.st4.v2i64.p0"
-        )]
-        fn _vst4q_s64(a: int64x2_t, b: int64x2_t, c: int64x2_t, d: int64x2_t, ptr: *mut i8);
-    }
-    _vst4q_s64(b.0, b.1, b.2, b.3, a as _)
+    crate::core_arch::macros::interleaving_store!(i64, 2, 4, a, b)
 }
 #[doc = "Store multiple 4-element structures from four registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_lane_f64)"]
diff --git a/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
index 9190c8518a667..0ec8024fdfbb6 100644
--- a/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
+++ b/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
@@ -4567,20 +4567,11 @@ intrinsics:
       unsafe: [neon]
     attr:
       - *neon-stable
-    assert_instr: [st1]
+    assert_instr: [stp]
     types:
-      - ['f64', float64x1x2_t, float64x1_t]
+      - ['f64', float64x1x2_t]
     compose:
-      - LLVMLink:
-          name: 'st2.{neon_type[1]}'
-          arguments:
-            - 'a: {type[2]}'
-            - 'b: {type[2]}'
-            - 'ptr: *mut i8'
-          links:
-            - link: 'llvm.aarch64.neon.st2.v{neon_type[1].lane}{type[0]}.p0'
-              arch: aarch64,arm64ec
-      - FnCall: ['_vst2{neon_type[1].nox}', ['b.0', 'b.1', 'a as _']]
+      - FnCall: [core::ptr::write_unaligned, ['a.cast()', b]]
 
   - name: "vst2{neon_type[1].nox}"
     doc: "Store multiple 2-element structures from two registers"
@@ -4591,19 +4582,10 @@ intrinsics:
       - *neon-stable
     assert_instr: [st2]
     types:
-      - [i64, int64x2x2_t, int64x2_t]
-      - [f64, float64x2x2_t, float64x2_t]
+      - [i64, int64x2x2_t, "2"]
+      - [f64, float64x2x2_t, "2"]
     compose:
-      - LLVMLink:
-          name: 'st2.{neon_type[1]}'
-          arguments:
-            - 'a: {type[2]}'
-            - 'b: {type[2]}'
-            - 'ptr: *mut i8'
-          links:
-            - link: 'llvm.aarch64.neon.st2.v{neon_type[1].lane}{type[0]}.p0'
-              arch: aarch64,arm64ec
-      - FnCall: ['_vst2{neon_type[1].nox}', ['b.0', 'b.1', 'a as _']]
+      - FnCall: ["crate::core_arch::macros::interleaving_store!", [{ Type: "{type[0]}" }, "{type[2]}", "2", a, b], [], true]
 
   - name: "vst2{neon_type[1].lane_nox}"
     doc: "Store multiple 2-element structures from two registers"
@@ -4781,19 +4763,9 @@ intrinsics:
     safety:
       unsafe: [neon]
     types:
-      - [f64, float64x1x3_t, float64x1_t]
+      - [f64, float64x1x3_t]
     compose:
-      - LLVMLink:
-          name: 'st3.{neon_type[1].nox}'
-          arguments:
-            - 'a: {type[2]}'
-            - 'b: {type[2]}'
-            - 'c: {type[2]}'
-            - 'ptr: *mut i8'
-          links:
-            - link: 'llvm.aarch64.neon.st3.v{neon_type[1].lane}{type[0]}.p0'
-              arch: aarch64,arm64ec
-      - FnCall: ['_vst3{neon_type[1].nox}', ['b.0', 'b.1', 'b.2', 'a as _']]
+      - FnCall: [core::ptr::write_unaligned, ['a.cast()', b]]
 
   - name: "vst3{neon_type[1].lane_nox}"
     doc: "Store multiple 3-element structures from three registers"
@@ -4860,20 +4832,10 @@ intrinsics:
     safety:
       unsafe: [neon]
     types:
-      - [i64, int64x2x3_t, int64x2_t]
-      - [f64, float64x2x3_t, float64x2_t]
+      - [i64, int64x2x3_t, "2"]
+      - [f64, float64x2x3_t, "2"]
     compose:
-      - LLVMLink:
-          name: 'st3.{neon_type[1].nox}'
-          arguments:
-            - 'a: {type[2]}'
-            - 'b: {type[2]}'
-            - 'c: {type[2]}'
-            - 'ptr: *mut i8'
-          links:
-            - link: 'llvm.aarch64.neon.st3.v{neon_type[1].lane}{type[0]}.p0'
-              arch: aarch64,arm64ec
-      - FnCall: ['_vst3{neon_type[1].nox}', ['b.0', 'b.1', 'b.2', 'a as _']]
+      - FnCall: ["crate::core_arch::macros::interleaving_store!", [{ Type: "{type[0]}" }, "{type[2]}", "3", a, b], [], true]
 
   - name: "vst3{neon_type[1].nox}"
     doc: "Store multiple 3-element structures from three registers"
@@ -4995,20 +4957,9 @@ intrinsics:
     safety:
       unsafe: [neon]
     types:
-      - [f64, float64x1x4_t, float64x1_t]
+      - [f64, float64x1x4_t]
     compose:
-      - LLVMLink:
-          name: 'st4.{neon_type[1].nox}'
-          arguments:
-            - 'a: {type[2]}'
-            - 'b: {type[2]}'
-            - 'c: {type[2]}'
-            - 'd: {type[2]}'
-            - 'ptr: *mut i8'
-          links:
-            - link: 'llvm.aarch64.neon.st4.v{neon_type[1].lane}{type[0]}.p0'
-              arch: aarch64,arm64ec
-      - FnCall: ['_vst4{neon_type[1].nox}', ['b.0', 'b.1', 'b.2', 'b.3', 'a as _']]
+      - FnCall: [core::ptr::write_unaligned, ['a.cast()', b]]
 
   - name: "vst4{neon_type[1].lane_nox}"
     doc: "Store multiple 4-element structures from four registers"
@@ -5075,21 +5026,10 @@ intrinsics:
     safety:
       unsafe: [neon]
     types:
-      - [i64, int64x2x4_t, int64x2_t]
-      - [f64, float64x2x4_t, float64x2_t]
+      - [i64, int64x2x4_t, "2"]
+      - [f64, float64x2x4_t, "2"]
     compose:
-      - LLVMLink:
-          name: 'st4.{neon_type[1].nox}'
-          arguments:
-            - 'a: {type[2]}'
-            - 'b: {type[2]}'
-            - 'c: {type[2]}'
-            - 'd: {type[2]}'
-            - 'ptr: *mut i8'
-          links:
-            - link: 'llvm.aarch64.neon.st4.v{neon_type[1].lane}{type[0]}.p0'
-              arch: aarch64,arm64ec
-      - FnCall: ['_vst4{neon_type[1].nox}', ['b.0', 'b.1', 'b.2', 'b.3', 'a as _']]
+      - FnCall: ["crate::core_arch::macros::interleaving_store!", [{ Type: "{type[0]}" }, "{type[2]}", "4", a, b], [], true]
 
   - name: "vst4{neon_type[1].nox}"
     doc: "Store multiple 4-element structures from four registers"

From a5ba7941f93f3a619247e583e0a913e2c3a7613d Mon Sep 17 00:00:00 2001
From: okaneco <47607823+okaneco@users.noreply.github.com>
Date: Thu, 19 Feb 2026 06:50:17 -0500
Subject: [PATCH 06/19] x86: use `simd::intrinsics` for saturating packs

Use intrinsics for `sse2`, `sse41`, `avx2`, `avx512bw`

The majority of implementations make use of `simd_shuffle` since that
optimized through to the avx512 intrinsics that made use of the lower
target feature intrinsics. Combined with masked stores, instruction
tests would fail presumably due to the casting and clamping that
the compiler couldn't see through. This is a known weakness as seen
in the other masked stores like the truncating conversion stores.
---
 .../stdarch/crates/core_arch/src/x86/sse2.rs  | 67 +++++++++++++++----
 1 file changed, 55 insertions(+), 12 deletions(-)

diff --git a/library/stdarch/crates/core_arch/src/x86/sse2.rs b/library/stdarch/crates/core_arch/src/x86/sse2.rs
index f339a003df4d1..fbf62c362f51b 100644
--- a/library/stdarch/crates/core_arch/src/x86/sse2.rs
+++ b/library/stdarch/crates/core_arch/src/x86/sse2.rs
@@ -1484,7 +1484,7 @@ pub const fn _mm_move_epi64(a: __m128i) -> __m128i {
     }
 }
 
-/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
+/// Converts packed signed 16-bit integers from `a` and `b` to packed 8-bit integers
 /// using signed saturation.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16)
@@ -1493,10 +1493,27 @@ pub const fn _mm_move_epi64(a: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(packsswb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(packsswb(a.as_i16x8(), b.as_i16x8())) }
+    unsafe {
+        let max = simd_splat(i16::from(i8::MAX));
+        let min = simd_splat(i16::from(i8::MIN));
+
+        let clamped_a = simd_imax(simd_imin(a.as_i16x8(), max), min)
+            .as_m128i()
+            .as_i8x16();
+        let clamped_b = simd_imax(simd_imin(b.as_i16x8(), max), min)
+            .as_m128i()
+            .as_i8x16();
+
+        // Shuffle the low i8 of each i16 from two concatenated vectors into
+        // the low bits of the result register.
+        const IDXS: [u32; 16] = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30];
+        let result: i8x16 = simd_shuffle!(clamped_a, clamped_b, IDXS);
+
+        result.as_m128i()
+    }
 }
 
-/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
+/// Converts packed signed 32-bit integers from `a` and `b` to packed 16-bit integers
 /// using signed saturation.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32)
@@ -1505,10 +1522,23 @@ pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(packssdw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(packssdw(a.as_i32x4(), b.as_i32x4())) }
+    unsafe {
+        let max = simd_splat(i32::from(i16::MAX));
+        let min = simd_splat(i32::from(i16::MIN));
+
+        let clamped_a = simd_imax(simd_imin(a.as_i32x4(), max), min);
+        let clamped_b = simd_imax(simd_imin(b.as_i32x4(), max), min);
+
+        let clamped_a: i16x4 = simd_cast(clamped_a);
+        let clamped_b: i16x4 = simd_cast(clamped_b);
+
+        let a: i64 = transmute(clamped_a);
+        let b: i64 = transmute(clamped_b);
+        i64x2::new(a, b).as_m128i()
+    }
 }
 
-/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
+/// Converts packed signed 16-bit integers from `a` and `b` to packed 8-bit integers
 /// using unsigned saturation.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16)
@@ -1517,7 +1547,26 @@ pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(packuswb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(packuswb(a.as_i16x8(), b.as_i16x8())) }
+    unsafe {
+        let max = simd_splat(i16::from(u8::MAX));
+        let min = simd_splat(i16::from(u8::MIN));
+
+        let clamped_a = simd_imax(simd_imin(a.as_i16x8(), max), min)
+            .as_m128i()
+            .as_i8x16();
+        let clamped_b = simd_imax(simd_imin(b.as_i16x8(), max), min)
+            .as_m128i()
+            .as_i8x16();
+
+        // Shuffle the low bytes of each i16 from two concatenated vectors into
+        // the low bits of the result register.
+        // Without `simd_shuffle`, this intrinsic will cause the AVX-512BW
+        // `_mm_mask_packus_epi16` and `_mm_maskz_packus_epi16` tests to fail.
+        const IDXS: [u32; 16] = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30];
+        let result: i8x16 = simd_shuffle!(clamped_a, clamped_b, IDXS);
+
+        result.as_m128i()
+    }
 }
 
 /// Returns the `imm8` element of `a`.
@@ -3217,12 +3266,6 @@ unsafe extern "C" {
     fn cvtps2dq(a: __m128) -> i32x4;
     #[link_name = "llvm.x86.sse2.maskmov.dqu"]
     fn maskmovdqu(a: i8x16, mask: i8x16, mem_addr: *mut i8);
-    #[link_name = "llvm.x86.sse2.packsswb.128"]
-    fn packsswb(a: i16x8, b: i16x8) -> i8x16;
-    #[link_name = "llvm.x86.sse2.packssdw.128"]
-    fn packssdw(a: i32x4, b: i32x4) -> i16x8;
-    #[link_name = "llvm.x86.sse2.packuswb.128"]
-    fn packuswb(a: i16x8, b: i16x8) -> u8x16;
     #[link_name = "llvm.x86.sse2.max.sd"]
     fn maxsd(a: __m128d, b: __m128d) -> __m128d;
     #[link_name = "llvm.x86.sse2.max.pd"]

From 6fa33bea62865ebbfb58b88f689e4210b0769c1f Mon Sep 17 00:00:00 2001
From: okaneco <47607823+okaneco@users.noreply.github.com>
Date: Thu, 19 Feb 2026 07:05:07 -0500
Subject: [PATCH 07/19] Use intrinsics for `sse41`

---
 .../stdarch/crates/core_arch/src/x86/sse41.rs | 23 +++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/library/stdarch/crates/core_arch/src/x86/sse41.rs b/library/stdarch/crates/core_arch/src/x86/sse41.rs
index 7ad4306f36f21..8036f24e24d37 100644
--- a/library/stdarch/crates/core_arch/src/x86/sse41.rs
+++ b/library/stdarch/crates/core_arch/src/x86/sse41.rs
@@ -418,7 +418,7 @@ pub const fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
     unsafe { simd_imin(a.as_u32x4(), b.as_u32x4()).as_m128i() }
 }
 
-/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
+/// Converts packed signed 32-bit integers from `a` and `b` to packed 16-bit integers
 /// using unsigned saturation
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32)
@@ -427,7 +427,24 @@ pub const fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(packusdw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(packusdw(a.as_i32x4(), b.as_i32x4())) }
+    unsafe {
+        let max = simd_splat(i32::from(u16::MAX));
+        let min = simd_splat(i32::from(u16::MIN));
+
+        let clamped_a = simd_imax(simd_imin(a.as_i32x4(), max), min)
+            .as_m128i()
+            .as_i16x8();
+        let clamped_b = simd_imax(simd_imin(b.as_i32x4(), max), min)
+            .as_m128i()
+            .as_i16x8();
+
+        // Shuffle the low u16 of each i32 from two concatenated vectors into
+        // the low bits of the result register.
+        const IDXS: [u32; 8] = [0, 2, 4, 6, 8, 10, 12, 14];
+        let result: i16x8 = simd_shuffle!(clamped_a, clamped_b, IDXS);
+
+        result.as_m128i()
+    }
 }
 
 /// Compares packed 64-bit integers in `a` and `b` for equality
@@ -1166,8 +1183,6 @@ pub unsafe fn _mm_stream_load_si128(mem_addr: *const __m128i) -> __m128i {
 unsafe extern "C" {
     #[link_name = "llvm.x86.sse41.insertps"]
     fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
-    #[link_name = "llvm.x86.sse41.packusdw"]
-    fn packusdw(a: i32x4, b: i32x4) -> u16x8;
     #[link_name = "llvm.x86.sse41.dppd"]
     fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
     #[link_name = "llvm.x86.sse41.dpps"]

From 639b07ec7513d655fd674ce109ec3b3e165ff441 Mon Sep 17 00:00:00 2001
From: okaneco <47607823+okaneco@users.noreply.github.com>
Date: Thu, 19 Feb 2026 08:06:24 -0500
Subject: [PATCH 08/19] Use intrinsics for `avx2`

---
 .../stdarch/crates/core_arch/src/x86/avx2.rs  | 108 +++++++++++++++---
 1 file changed, 92 insertions(+), 16 deletions(-)

diff --git a/library/stdarch/crates/core_arch/src/x86/avx2.rs b/library/stdarch/crates/core_arch/src/x86/avx2.rs
index 04a88e461f752..ca4ca9a2de9a4 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx2.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs
@@ -2315,7 +2315,7 @@ pub const fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
     unsafe { transmute(simd_or(a.as_i32x8(), b.as_i32x8())) }
 }
 
-/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
+/// Converts packed signed 16-bit integers from `a` and `b` to packed 8-bit integers
 /// using signed saturation
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16)
@@ -2324,10 +2324,31 @@ pub const fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpacksswb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(packsswb(a.as_i16x16(), b.as_i16x16())) }
+    unsafe {
+        let max = simd_splat(i16::from(i8::MAX));
+        let min = simd_splat(i16::from(i8::MIN));
+
+        let clamped_a = simd_imax(simd_imin(a.as_i16x16(), max), min)
+            .as_m256i()
+            .as_i8x32();
+        let clamped_b = simd_imax(simd_imin(b.as_i16x16(), max), min)
+            .as_m256i()
+            .as_i8x32();
+
+        #[rustfmt::skip]
+        const IDXS: [u32; 32] = [
+            00, 02, 04, 06, 08, 10, 12, 14, // a-lo i16 to i8 conversions
+            32, 34, 36, 38, 40, 42, 44, 46, // b-lo
+            16, 18, 20, 22, 24, 26, 28, 30, // a-hi
+            48, 50, 52, 54, 56, 58, 60, 62, // b-hi
+        ];
+        let result: i8x32 = simd_shuffle!(clamped_a, clamped_b, IDXS);
+
+        result.as_m256i()
+    }
 }
 
-/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
+/// Converts packed signed 32-bit integers from `a` and `b` to packed 16-bit integers
 /// using signed saturation
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32)
@@ -2336,10 +2357,31 @@ pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpackssdw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(packssdw(a.as_i32x8(), b.as_i32x8())) }
+    unsafe {
+        let max = simd_splat(i32::from(i16::MAX));
+        let min = simd_splat(i32::from(i16::MIN));
+
+        let clamped_a = simd_imax(simd_imin(a.as_i32x8(), max), min)
+            .as_m256i()
+            .as_i16x16();
+        let clamped_b = simd_imax(simd_imin(b.as_i32x8(), max), min)
+            .as_m256i()
+            .as_i16x16();
+
+        #[rustfmt::skip]
+        const IDXS: [u32; 16] = [
+            00, 02, 04, 06, // a-lo i32 to i16 conversions
+            16, 18, 20, 22, // b-lo
+            08, 10, 12, 14, // a-hi
+            24, 26, 28, 30, // b-hi
+        ];
+        let result: i16x16 = simd_shuffle!(clamped_a, clamped_b, IDXS);
+
+        result.as_m256i()
+    }
 }
 
-/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
+/// Converts packed signed 16-bit integers from `a` and `b` to packed 8-bit integers
 /// using unsigned saturation
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16)
@@ -2348,10 +2390,31 @@ pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpackuswb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(packuswb(a.as_i16x16(), b.as_i16x16())) }
+    unsafe {
+        let max = simd_splat(i16::from(u8::MAX));
+        let min = simd_splat(i16::from(u8::MIN));
+
+        let clamped_a = simd_imax(simd_imin(a.as_i16x16(), max), min)
+            .as_m256i()
+            .as_i8x32();
+        let clamped_b = simd_imax(simd_imin(b.as_i16x16(), max), min)
+            .as_m256i()
+            .as_i8x32();
+
+        #[rustfmt::skip]
+        const IDXS: [u32; 32] = [
+            00, 02, 04, 06, 08, 10, 12, 14, // a-lo i16 to u8 conversions
+            32, 34, 36, 38, 40, 42, 44, 46, // b-lo
+            16, 18, 20, 22, 24, 26, 28, 30, // a-hi
+            48, 50, 52, 54, 56, 58, 60, 62, // b-hi
+        ];
+        let result: i8x32 = simd_shuffle!(clamped_a, clamped_b, IDXS);
+
+        result.as_m256i()
+    }
 }
 
-/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
+/// Converts packed signed 32-bit integers from `a` and `b` to packed 16-bit integers
 /// using unsigned saturation
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32)
@@ -2360,7 +2423,28 @@ pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpackusdw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(packusdw(a.as_i32x8(), b.as_i32x8())) }
+    unsafe {
+        let max = simd_splat(i32::from(u16::MAX));
+        let min = simd_splat(i32::from(u16::MIN));
+
+        let clamped_a = simd_imax(simd_imin(a.as_i32x8(), max), min)
+            .as_m256i()
+            .as_i16x16();
+        let clamped_b = simd_imax(simd_imin(b.as_i32x8(), max), min)
+            .as_m256i()
+            .as_i16x16();
+
+        #[rustfmt::skip]
+        const IDXS: [u32; 16] = [
+            00, 02, 04, 06, // a-lo i32 to u16 conversions
+            16, 18, 20, 22, // b-lo
+            08, 10, 12, 14, // a-hi
+            24, 26, 28, 30, // b-hi
+        ];
+        let result: i16x16 = simd_shuffle!(clamped_a, clamped_b, IDXS);
+
+        result.as_m256i()
+    }
 }
 
 /// Permutes packed 32-bit integers from `a` according to the content of `b`.
@@ -3827,14 +3911,6 @@ unsafe extern "C" {
     fn mpsadbw(a: u8x32, b: u8x32, imm8: i8) -> u16x16;
     #[link_name = "llvm.x86.avx2.pmul.hr.sw"]
     fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx2.packsswb"]
-    fn packsswb(a: i16x16, b: i16x16) -> i8x32;
-    #[link_name = "llvm.x86.avx2.packssdw"]
-    fn packssdw(a: i32x8, b: i32x8) -> i16x16;
-    #[link_name = "llvm.x86.avx2.packuswb"]
-    fn packuswb(a: i16x16, b: i16x16) -> u8x32;
-    #[link_name = "llvm.x86.avx2.packusdw"]
-    fn packusdw(a: i32x8, b: i32x8) -> u16x16;
     #[link_name = "llvm.x86.avx2.psad.bw"]
     fn psadbw(a: u8x32, b: u8x32) -> u64x4;
     #[link_name = "llvm.x86.avx2.psign.b"]

From c7ecbd5e4b412a3ee76463037ac46ab401a62c5b Mon Sep 17 00:00:00 2001
From: okaneco <47607823+okaneco@users.noreply.github.com>
Date: Thu, 19 Feb 2026 08:42:08 -0500
Subject: [PATCH 09/19] Use intrinsics for `avx512bw`

---
 .../crates/core_arch/src/x86/avx512bw.rs      | 117 ++++++++++++++++--
 1 file changed, 104 insertions(+), 13 deletions(-)

diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
index 3ba171c0fa50f..78801e8902107 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
@@ -6524,7 +6524,32 @@ pub fn _mm_maskz_maddubs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpackssdw))]
 pub fn _mm512_packs_epi32(a: __m512i, b: __m512i) -> __m512i {
-    unsafe { transmute(vpackssdw(a.as_i32x16(), b.as_i32x16())) }
+    unsafe {
+        let max = simd_splat(i32::from(i16::MAX));
+        let min = simd_splat(i32::from(i16::MIN));
+
+        let clamped_a = simd_imax(simd_imin(a.as_i32x16(), max), min)
+            .as_m512i()
+            .as_i16x32();
+        let clamped_b = simd_imax(simd_imin(b.as_i32x16(), max), min)
+            .as_m512i()
+            .as_i16x32();
+
+        #[rustfmt::skip]
+        const IDXS: [u32; 32] = [
+            00, 02, 04, 06,
+            32, 34, 36, 38,
+            08, 10, 12, 14,
+            40, 42, 44, 46,
+            16, 18, 20, 22,
+            48, 50, 52, 54,
+            24, 26, 28, 30,
+            56, 58, 60, 62,
+        ];
+        let result: i16x32 = simd_shuffle!(clamped_a, clamped_b, IDXS);
+
+        result.as_m512i()
+    }
 }
 
 /// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -6619,7 +6644,32 @@ pub fn _mm_maskz_packs_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpacksswb))]
 pub fn _mm512_packs_epi16(a: __m512i, b: __m512i) -> __m512i {
-    unsafe { transmute(vpacksswb(a.as_i16x32(), b.as_i16x32())) }
+    unsafe {
+        let max = simd_splat(i16::from(i8::MAX));
+        let min = simd_splat(i16::from(i8::MIN));
+
+        let clamped_a = simd_imax(simd_imin(a.as_i16x32(), max), min)
+            .as_m512i()
+            .as_i8x64();
+        let clamped_b = simd_imax(simd_imin(b.as_i16x32(), max), min)
+            .as_m512i()
+            .as_i8x64();
+
+        #[rustfmt::skip]
+        const IDXS: [u32; 64] = [
+            000, 002, 004, 006, 008, 010, 012, 014,
+            064, 066, 068, 070, 072, 074, 076, 078,
+            016, 018, 020, 022, 024, 026, 028, 030,
+            080, 082, 084, 086, 088, 090, 092, 094,
+            032, 034, 036, 038, 040, 042, 044, 046,
+            096, 098, 100, 102, 104, 106, 108, 110,
+            048, 050, 052, 054, 056, 058, 060, 062,
+            112, 114, 116, 118, 120, 122, 124, 126,
+        ];
+        let result: i8x64 = simd_shuffle!(clamped_a, clamped_b, IDXS);
+
+        result.as_m512i()
+    }
 }
 
 /// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -6714,7 +6764,32 @@ pub fn _mm_maskz_packs_epi16(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpackusdw))]
 pub fn _mm512_packus_epi32(a: __m512i, b: __m512i) -> __m512i {
-    unsafe { transmute(vpackusdw(a.as_i32x16(), b.as_i32x16())) }
+    unsafe {
+        let max = simd_splat(i32::from(u16::MAX));
+        let min = simd_splat(i32::from(u16::MIN));
+
+        let clamped_a = simd_imax(simd_imin(a.as_i32x16(), max), min)
+            .as_m512i()
+            .as_i16x32();
+        let clamped_b = simd_imax(simd_imin(b.as_i32x16(), max), min)
+            .as_m512i()
+            .as_i16x32();
+
+        #[rustfmt::skip]
+        const IDXS: [u32; 32] = [
+            00, 02, 04, 06,
+            32, 34, 36, 38,
+            08, 10, 12, 14,
+            40, 42, 44, 46,
+            16, 18, 20, 22,
+            48, 50, 52, 54,
+            24, 26, 28, 30,
+            56, 58, 60, 62,
+        ];
+        let result: i16x32 = simd_shuffle!(clamped_a, clamped_b, IDXS);
+
+        result.as_m512i()
+    }
 }
 
 /// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -6809,7 +6884,32 @@ pub fn _mm_maskz_packus_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpackuswb))]
 pub fn _mm512_packus_epi16(a: __m512i, b: __m512i) -> __m512i {
-    unsafe { transmute(vpackuswb(a.as_i16x32(), b.as_i16x32())) }
+    unsafe {
+        let max = simd_splat(i16::from(u8::MAX));
+        let min = simd_splat(i16::from(u8::MIN));
+
+        let clamped_a = simd_imax(simd_imin(a.as_i16x32(), max), min)
+            .as_m512i()
+            .as_i8x64();
+        let clamped_b = simd_imax(simd_imin(b.as_i16x32(), max), min)
+            .as_m512i()
+            .as_i8x64();
+
+        #[rustfmt::skip]
+        const IDXS: [u32; 64] = [
+            000, 002, 004, 006, 008, 010, 012, 014,
+            064, 066, 068, 070, 072, 074, 076, 078,
+            016, 018, 020, 022, 024, 026, 028, 030,
+            080, 082, 084, 086, 088, 090, 092, 094,
+            032, 034, 036, 038, 040, 042, 044, 046,
+            096, 098, 100, 102, 104, 106, 108, 110,
+            048, 050, 052, 054, 056, 058, 060, 062,
+            112, 114, 116, 118, 120, 122, 124, 126,
+        ];
+        let result: i8x64 = simd_shuffle!(clamped_a, clamped_b, IDXS);
+
+        result.as_m512i()
+    }
 }
 
 /// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -12606,15 +12706,6 @@ unsafe extern "C" {
     #[link_name = "llvm.x86.avx512.pmaddubs.w.512"]
     fn vpmaddubsw(a: u8x64, b: i8x64) -> i16x32;
 
-    #[link_name = "llvm.x86.avx512.packssdw.512"]
-    fn vpackssdw(a: i32x16, b: i32x16) -> i16x32;
-    #[link_name = "llvm.x86.avx512.packsswb.512"]
-    fn vpacksswb(a: i16x32, b: i16x32) -> i8x64;
-    #[link_name = "llvm.x86.avx512.packusdw.512"]
-    fn vpackusdw(a: i32x16, b: i32x16) -> u16x32;
-    #[link_name = "llvm.x86.avx512.packuswb.512"]
-    fn vpackuswb(a: i16x32, b: i16x32) -> u8x64;
-
     #[link_name = "llvm.x86.avx512.psll.w.512"]
     fn vpsllw(a: i16x32, count: i16x8) -> i16x32;
 

From 22d10c516d0b2cb1dd90a489bfd5402a96106f3e Mon Sep 17 00:00:00 2001
From: okaneco <47607823+okaneco@users.noreply.github.com>
Date: Fri, 20 Feb 2026 06:53:07 -0500
Subject: [PATCH 10/19] x86: Followup to add const for pack intrinsics

Add const to `sse2`, `sse41`, `avx2`, and `avx512bw` functions and tests
---
 .../crates/core_arch/src/x86/avx512bw.rs      | 30 +++++++++++--------
 .../stdarch/crates/core_arch/src/x86/sse2.rs  | 27 +++++++++--------
 2 files changed, 33 insertions(+), 24 deletions(-)

diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
index 78801e8902107..8481edcdb38d6 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
@@ -6615,7 +6615,8 @@ pub fn _mm256_maskz_packs_epi32(k: __mmask16, a: __m256i, b: __m256i) -> __m256i
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpackssdw))]
-pub fn _mm_mask_packs_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm_mask_packs_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
     unsafe {
         let pack = _mm_packs_epi32(a, b).as_i16x8();
         transmute(simd_select_bitmask(k, pack, src.as_i16x8()))
@@ -6629,7 +6630,8 @@ pub fn _mm_mask_packs_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpackssdw))]
-pub fn _mm_maskz_packs_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm_maskz_packs_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
     unsafe {
         let pack = _mm_packs_epi32(a, b).as_i16x8();
         transmute(simd_select_bitmask(k, pack, i16x8::ZERO))
@@ -6735,7 +6737,8 @@ pub fn _mm256_maskz_packs_epi16(k: __mmask32, a: __m256i, b: __m256i) -> __m256i
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpacksswb))]
-pub fn _mm_mask_packs_epi16(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm_mask_packs_epi16(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
     unsafe {
         let pack = _mm_packs_epi16(a, b).as_i8x16();
         transmute(simd_select_bitmask(k, pack, src.as_i8x16()))
@@ -6749,7 +6752,8 @@ pub fn _mm_mask_packs_epi16(src: __m128i, k: __mmask16, a: __m128i, b: __m128i)
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpacksswb))]
-pub fn _mm_maskz_packs_epi16(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm_maskz_packs_epi16(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
     unsafe {
         let pack = _mm_packs_epi16(a, b).as_i8x16();
         transmute(simd_select_bitmask(k, pack, i8x16::ZERO))
@@ -6975,7 +6979,8 @@ pub fn _mm256_maskz_packus_epi16(k: __mmask32, a: __m256i, b: __m256i) -> __m256
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpackuswb))]
-pub fn _mm_mask_packus_epi16(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm_mask_packus_epi16(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
     unsafe {
         let pack = _mm_packus_epi16(a, b).as_i8x16();
         transmute(simd_select_bitmask(k, pack, src.as_i8x16()))
@@ -6989,7 +6994,8 @@ pub fn _mm_mask_packus_epi16(src: __m128i, k: __mmask16, a: __m128i, b: __m128i)
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpackuswb))]
-pub fn _mm_maskz_packus_epi16(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm_maskz_packus_epi16(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
     unsafe {
         let pack = _mm_packus_epi16(a, b).as_i8x16();
         transmute(simd_select_bitmask(k, pack, i8x16::ZERO))
@@ -17854,7 +17860,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    fn test_mm_mask_packs_epi32() {
+    const fn test_mm_mask_packs_epi32() {
         let a = _mm_set1_epi32(i32::MAX);
         let b = _mm_set1_epi32(1 << 16 | 1);
         let r = _mm_mask_packs_epi32(a, 0, a, b);
@@ -17865,7 +17871,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    fn test_mm_maskz_packs_epi32() {
+    const fn test_mm_maskz_packs_epi32() {
         let a = _mm_set1_epi32(i32::MAX);
         let b = _mm_set1_epi32(1);
         let r = _mm_maskz_packs_epi32(0, a, b);
@@ -17954,7 +17960,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    fn test_mm_mask_packs_epi16() {
+    const fn test_mm_mask_packs_epi16() {
         let a = _mm_set1_epi16(i16::MAX);
         let b = _mm_set1_epi16(1 << 8 | 1);
         let r = _mm_mask_packs_epi16(a, 0, a, b);
@@ -17966,7 +17972,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    fn test_mm_maskz_packs_epi16() {
+    const fn test_mm_maskz_packs_epi16() {
         let a = _mm_set1_epi16(i16::MAX);
         let b = _mm_set1_epi16(1);
         let r = _mm_maskz_packs_epi16(0, a, b);
@@ -18137,7 +18143,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    fn test_mm_mask_packus_epi16() {
+    const fn test_mm_mask_packus_epi16() {
         let a = _mm_set1_epi16(-1);
         let b = _mm_set1_epi16(1 << 8 | 1);
         let r = _mm_mask_packus_epi16(a, 0, a, b);
@@ -18148,7 +18154,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    fn test_mm_maskz_packus_epi16() {
+    const fn test_mm_maskz_packus_epi16() {
         let a = _mm_set1_epi16(-1);
         let b = _mm_set1_epi16(1);
         let r = _mm_maskz_packus_epi16(0, a, b);
diff --git a/library/stdarch/crates/core_arch/src/x86/sse2.rs b/library/stdarch/crates/core_arch/src/x86/sse2.rs
index fbf62c362f51b..1f97f3c69d0e3 100644
--- a/library/stdarch/crates/core_arch/src/x86/sse2.rs
+++ b/library/stdarch/crates/core_arch/src/x86/sse2.rs
@@ -1492,10 +1492,11 @@ pub const fn _mm_move_epi64(a: __m128i) -> __m128i {
 #[target_feature(enable = "sse2")]
 #[cfg_attr(test, assert_instr(packsswb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i {
     unsafe {
-        let max = simd_splat(i16::from(i8::MAX));
-        let min = simd_splat(i16::from(i8::MIN));
+        let max = simd_splat(i8::MAX as i16);
+        let min = simd_splat(i8::MIN as i16);
 
         let clamped_a = simd_imax(simd_imin(a.as_i16x8(), max), min)
             .as_m128i()
@@ -1521,10 +1522,11 @@ pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i {
 #[target_feature(enable = "sse2")]
 #[cfg_attr(test, assert_instr(packssdw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i {
     unsafe {
-        let max = simd_splat(i32::from(i16::MAX));
-        let min = simd_splat(i32::from(i16::MIN));
+        let max = simd_splat(i16::MAX as i32);
+        let min = simd_splat(i16::MIN as i32);
 
         let clamped_a = simd_imax(simd_imin(a.as_i32x4(), max), min);
         let clamped_b = simd_imax(simd_imin(b.as_i32x4(), max), min);
@@ -1546,10 +1548,11 @@ pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i {
 #[target_feature(enable = "sse2")]
 #[cfg_attr(test, assert_instr(packuswb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i {
     unsafe {
-        let max = simd_splat(i16::from(u8::MAX));
-        let min = simd_splat(i16::from(u8::MIN));
+        let max = simd_splat(u8::MAX as i16);
+        let min = simd_splat(u8::MIN as i16);
 
         let clamped_a = simd_imax(simd_imin(a.as_i16x8(), max), min)
             .as_m128i()
@@ -4329,7 +4332,7 @@ mod tests {
     }
 
     #[simd_test(enable = "sse2")]
-    fn test_mm_packs_epi16() {
+    const fn test_mm_packs_epi16() {
         let a = _mm_setr_epi16(0x80, -0x81, 0, 0, 0, 0, 0, 0);
         let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -0x81, 0x80);
         let r = _mm_packs_epi16(a, b);
@@ -4343,7 +4346,7 @@ mod tests {
     }
 
     #[simd_test(enable = "sse2")]
-    fn test_mm_packs_epi32() {
+    const fn test_mm_packs_epi32() {
         let a = _mm_setr_epi32(0x8000, -0x8001, 0, 0);
         let b = _mm_setr_epi32(0, 0, -0x8001, 0x8000);
         let r = _mm_packs_epi32(a, b);
@@ -4354,7 +4357,7 @@ mod tests {
     }
 
     #[simd_test(enable = "sse2")]
-    fn test_mm_packus_epi16() {
+    const fn test_mm_packus_epi16() {
         let a = _mm_setr_epi16(0x100, -1, 0, 0, 0, 0, 0, 0);
         let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -1, 0x100);
         let r = _mm_packus_epi16(a, b);

From 17a4c8ea8fa5c17fb698e6a3f3194ef1ac79c7d5 Mon Sep 17 00:00:00 2001
From: okaneco <47607823+okaneco@users.noreply.github.com>
Date: Fri, 20 Feb 2026 07:02:29 -0500
Subject: [PATCH 11/19] Add const to `sse41` intrinsics

---
 library/stdarch/crates/core_arch/src/x86/avx512bw.rs | 10 ++++++----
 library/stdarch/crates/core_arch/src/x86/sse41.rs    |  9 +++++----
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
index 8481edcdb38d6..360b755d5818a 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
@@ -6859,7 +6859,8 @@ pub fn _mm256_maskz_packus_epi32(k: __mmask16, a: __m256i, b: __m256i) -> __m256
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpackusdw))]
-pub fn _mm_mask_packus_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm_mask_packus_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
     unsafe {
         let pack = _mm_packus_epi32(a, b).as_i16x8();
         transmute(simd_select_bitmask(k, pack, src.as_i16x8()))
@@ -6873,7 +6874,8 @@ pub fn _mm_mask_packus_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i)
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpackusdw))]
-pub fn _mm_maskz_packus_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm_maskz_packus_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
     unsafe {
         let pack = _mm_packus_epi32(a, b).as_i16x8();
         transmute(simd_select_bitmask(k, pack, i16x8::ZERO))
@@ -18043,7 +18045,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    fn test_mm_mask_packus_epi32() {
+    const fn test_mm_mask_packus_epi32() {
         let a = _mm_set1_epi32(-1);
         let b = _mm_set1_epi32(1 << 16 | 1);
         let r = _mm_mask_packus_epi32(a, 0, a, b);
@@ -18054,7 +18056,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    fn test_mm_maskz_packus_epi32() {
+    const fn test_mm_maskz_packus_epi32() {
         let a = _mm_set1_epi32(-1);
         let b = _mm_set1_epi32(1);
         let r = _mm_maskz_packus_epi32(0, a, b);
diff --git a/library/stdarch/crates/core_arch/src/x86/sse41.rs b/library/stdarch/crates/core_arch/src/x86/sse41.rs
index 8036f24e24d37..4ebf7d3bd39a8 100644
--- a/library/stdarch/crates/core_arch/src/x86/sse41.rs
+++ b/library/stdarch/crates/core_arch/src/x86/sse41.rs
@@ -426,10 +426,11 @@ pub const fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
 #[target_feature(enable = "sse4.1")]
 #[cfg_attr(test, assert_instr(packusdw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
     unsafe {
-        let max = simd_splat(i32::from(u16::MAX));
-        let min = simd_splat(i32::from(u16::MIN));
+        let max = simd_splat(u16::MAX as i32);
+        let min = simd_splat(u16::MIN as i32);
 
         let clamped_a = simd_imax(simd_imin(a.as_i32x4(), max), min)
             .as_m128i()
@@ -1470,7 +1471,7 @@ mod tests {
     }
 
     #[simd_test(enable = "sse4.1")]
-    fn test_mm_packus_epi32() {
+    const fn test_mm_packus_epi32() {
         let a = _mm_setr_epi32(1, 2, 3, 4);
         let b = _mm_setr_epi32(-1, -2, -3, -4);
         let r = _mm_packus_epi32(a, b);

From 4f881c4b89bc86577eeb63767292bac5d5896270 Mon Sep 17 00:00:00 2001
From: okaneco <47607823+okaneco@users.noreply.github.com>
Date: Fri, 20 Feb 2026 07:10:51 -0500
Subject: [PATCH 12/19] Add const to `avx2` intrinsics

---
 .../stdarch/crates/core_arch/src/x86/avx2.rs  | 36 ++++++------
 .../crates/core_arch/src/x86/avx512bw.rs      | 55 ++++++++++++++-----
 2 files changed, 61 insertions(+), 30 deletions(-)

diff --git a/library/stdarch/crates/core_arch/src/x86/avx2.rs b/library/stdarch/crates/core_arch/src/x86/avx2.rs
index ca4ca9a2de9a4..b49ad9522a412 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx2.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs
@@ -2323,10 +2323,11 @@ pub const fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpacksswb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
     unsafe {
-        let max = simd_splat(i16::from(i8::MAX));
-        let min = simd_splat(i16::from(i8::MIN));
+        let max = simd_splat(i8::MAX as i16);
+        let min = simd_splat(i8::MIN as i16);
 
         let clamped_a = simd_imax(simd_imin(a.as_i16x16(), max), min)
             .as_m256i()
@@ -2356,10 +2357,11 @@ pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpackssdw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
     unsafe {
-        let max = simd_splat(i32::from(i16::MAX));
-        let min = simd_splat(i32::from(i16::MIN));
+        let max = simd_splat(i16::MAX as i32);
+        let min = simd_splat(i16::MIN as i32);
 
         let clamped_a = simd_imax(simd_imin(a.as_i32x8(), max), min)
             .as_m256i()
@@ -2389,10 +2391,11 @@ pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpackuswb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
     unsafe {
-        let max = simd_splat(i16::from(u8::MAX));
-        let min = simd_splat(i16::from(u8::MIN));
+        let max = simd_splat(u8::MAX as i16);
+        let min = simd_splat(u8::MIN as i16);
 
         let clamped_a = simd_imax(simd_imin(a.as_i16x16(), max), min)
             .as_m256i()
@@ -2422,10 +2425,11 @@ pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpackusdw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
     unsafe {
-        let max = simd_splat(i32::from(u16::MAX));
-        let min = simd_splat(i32::from(u16::MIN));
+        let max = simd_splat(u16::MAX as i32);
+        let min = simd_splat(u16::MIN as i32);
 
         let clamped_a = simd_imax(simd_imin(a.as_i32x8(), max), min)
             .as_m256i()
@@ -5064,7 +5068,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx2")]
-    fn test_mm256_packs_epi16() {
+    const fn test_mm256_packs_epi16() {
         let a = _mm256_set1_epi16(2);
         let b = _mm256_set1_epi16(4);
         let r = _mm256_packs_epi16(a, b);
@@ -5080,7 +5084,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx2")]
-    fn test_mm256_packs_epi32() {
+    const fn test_mm256_packs_epi32() {
         let a = _mm256_set1_epi32(2);
         let b = _mm256_set1_epi32(4);
         let r = _mm256_packs_epi32(a, b);
@@ -5090,7 +5094,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx2")]
-    fn test_mm256_packus_epi16() {
+    const fn test_mm256_packus_epi16() {
         let a = _mm256_set1_epi16(2);
         let b = _mm256_set1_epi16(4);
         let r = _mm256_packus_epi16(a, b);
@@ -5106,7 +5110,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx2")]
-    fn test_mm256_packus_epi32() {
+    const fn test_mm256_packus_epi32() {
         let a = _mm256_set1_epi32(2);
         let b = _mm256_set1_epi32(4);
         let r = _mm256_packus_epi32(a, b);
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
index 360b755d5818a..8c7921fc18019 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
@@ -6587,7 +6587,13 @@ pub fn _mm512_maskz_packs_epi32(k: __mmask32, a: __m512i, b: __m512i) -> __m512i
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpackssdw))]
-pub fn _mm256_mask_packs_epi32(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm256_mask_packs_epi32(
+    src: __m256i,
+    k: __mmask16,
+    a: __m256i,
+    b: __m256i,
+) -> __m256i {
     unsafe {
         let pack = _mm256_packs_epi32(a, b).as_i16x16();
         transmute(simd_select_bitmask(k, pack, src.as_i16x16()))
@@ -6709,7 +6715,13 @@ pub fn _mm512_maskz_packs_epi16(k: __mmask64, a: __m512i, b: __m512i) -> __m512i
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpacksswb))]
-pub fn _mm256_mask_packs_epi16(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm256_mask_packs_epi16(
+    src: __m256i,
+    k: __mmask32,
+    a: __m256i,
+    b: __m256i,
+) -> __m256i {
     unsafe {
         let pack = _mm256_packs_epi16(a, b).as_i8x32();
         transmute(simd_select_bitmask(k, pack, src.as_i8x32()))
@@ -6723,7 +6735,8 @@ pub fn _mm256_mask_packs_epi16(src: __m256i, k: __mmask32, a: __m256i, b: __m256
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpacksswb))]
-pub fn _mm256_maskz_packs_epi16(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm256_maskz_packs_epi16(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
     unsafe {
         let pack = _mm256_packs_epi16(a, b).as_i8x32();
         transmute(simd_select_bitmask(k, pack, i8x32::ZERO))
@@ -6831,7 +6844,13 @@ pub fn _mm512_maskz_packus_epi32(k: __mmask32, a: __m512i, b: __m512i) -> __m512
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpackusdw))]
-pub fn _mm256_mask_packus_epi32(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm256_mask_packus_epi32(
+    src: __m256i,
+    k: __mmask16,
+    a: __m256i,
+    b: __m256i,
+) -> __m256i {
     unsafe {
         let pack = _mm256_packus_epi32(a, b).as_i16x16();
         transmute(simd_select_bitmask(k, pack, src.as_i16x16()))
@@ -6845,7 +6864,8 @@ pub fn _mm256_mask_packus_epi32(src: __m256i, k: __mmask16, a: __m256i, b: __m25
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpackusdw))]
-pub fn _mm256_maskz_packus_epi32(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm256_maskz_packus_epi32(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
     unsafe {
         let pack = _mm256_packus_epi32(a, b).as_i16x16();
         transmute(simd_select_bitmask(k, pack, i16x16::ZERO))
@@ -6953,7 +6973,13 @@ pub fn _mm512_maskz_packus_epi16(k: __mmask64, a: __m512i, b: __m512i) -> __m512
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpackuswb))]
-pub fn _mm256_mask_packus_epi16(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm256_mask_packus_epi16(
+    src: __m256i,
+    k: __mmask32,
+    a: __m256i,
+    b: __m256i,
+) -> __m256i {
     unsafe {
         let pack = _mm256_packus_epi16(a, b).as_i8x32();
         transmute(simd_select_bitmask(k, pack, src.as_i8x32()))
@@ -6967,7 +6993,8 @@ pub fn _mm256_mask_packus_epi16(src: __m256i, k: __mmask32, a: __m256i, b: __m25
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpackuswb))]
-pub fn _mm256_maskz_packus_epi16(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm256_maskz_packus_epi16(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
     unsafe {
         let pack = _mm256_packus_epi16(a, b).as_i8x32();
         transmute(simd_select_bitmask(k, pack, i8x32::ZERO))
@@ -17838,7 +17865,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    fn test_mm256_mask_packs_epi32() {
+    const fn test_mm256_mask_packs_epi32() {
         let a = _mm256_set1_epi32(i32::MAX);
         let b = _mm256_set1_epi32(1 << 16 | 1);
         let r = _mm256_mask_packs_epi32(a, 0, a, b);
@@ -17936,7 +17963,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    fn test_mm256_mask_packs_epi16() {
+    const fn test_mm256_mask_packs_epi16() {
         let a = _mm256_set1_epi16(i16::MAX);
         let b = _mm256_set1_epi16(1 << 8 | 1);
         let r = _mm256_mask_packs_epi16(a, 0, a, b);
@@ -17949,7 +17976,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    fn test_mm256_maskz_packs_epi16() {
+    const fn test_mm256_maskz_packs_epi16() {
         let a = _mm256_set1_epi16(i16::MAX);
         let b = _mm256_set1_epi16(1);
         let r = _mm256_maskz_packs_epi16(0, a, b);
@@ -18023,7 +18050,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    fn test_mm256_mask_packus_epi32() {
+    const fn test_mm256_mask_packus_epi32() {
         let a = _mm256_set1_epi32(-1);
         let b = _mm256_set1_epi32(1 << 16 | 1);
         let r = _mm256_mask_packus_epi32(a, 0, a, b);
@@ -18034,7 +18061,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    fn test_mm256_maskz_packus_epi32() {
+    const fn test_mm256_maskz_packus_epi32() {
         let a = _mm256_set1_epi32(-1);
         let b = _mm256_set1_epi32(1);
         let r = _mm256_maskz_packus_epi32(0, a, b);
@@ -18119,7 +18146,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    fn test_mm256_mask_packus_epi16() {
+    const fn test_mm256_mask_packus_epi16() {
         let a = _mm256_set1_epi16(-1);
         let b = _mm256_set1_epi16(1 << 8 | 1);
         let r = _mm256_mask_packus_epi16(a, 0, a, b);
@@ -18132,7 +18159,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    fn test_mm256_maskz_packus_epi16() {
+    const fn test_mm256_maskz_packus_epi16() {
         let a = _mm256_set1_epi16(-1);
         let b = _mm256_set1_epi16(1);
         let r = _mm256_maskz_packus_epi16(0, a, b);

From 37bd99e2e0992495589c330d7478ec00f2dd8b85 Mon Sep 17 00:00:00 2001
From: okaneco <47607823+okaneco@users.noreply.github.com>
Date: Fri, 20 Feb 2026 07:30:10 -0500
Subject: [PATCH 13/19] Add const to `avx512bw` intrinsics

---
 .../crates/core_arch/src/x86/avx512bw.rs      | 96 ++++++++++++-------
 1 file changed, 64 insertions(+), 32 deletions(-)

diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
index 8c7921fc18019..b41f8576cfe54 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
@@ -6523,10 +6523,11 @@ pub fn _mm_maskz_maddubs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
 #[target_feature(enable = "avx512bw")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpackssdw))]
-pub fn _mm512_packs_epi32(a: __m512i, b: __m512i) -> __m512i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm512_packs_epi32(a: __m512i, b: __m512i) -> __m512i {
     unsafe {
-        let max = simd_splat(i32::from(i16::MAX));
-        let min = simd_splat(i32::from(i16::MIN));
+        let max = simd_splat(i16::MAX as i32);
+        let min = simd_splat(i16::MIN as i32);
 
         let clamped_a = simd_imax(simd_imin(a.as_i32x16(), max), min)
             .as_m512i()
@@ -6559,7 +6560,13 @@ pub fn _mm512_packs_epi32(a: __m512i, b: __m512i) -> __m512i {
 #[target_feature(enable = "avx512bw")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpackssdw))]
-pub fn _mm512_mask_packs_epi32(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm512_mask_packs_epi32(
+    src: __m512i,
+    k: __mmask32,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
     unsafe {
         let pack = _mm512_packs_epi32(a, b).as_i16x32();
         transmute(simd_select_bitmask(k, pack, src.as_i16x32()))
@@ -6573,7 +6580,8 @@ pub fn _mm512_mask_packs_epi32(src: __m512i, k: __mmask32, a: __m512i, b: __m512
 #[target_feature(enable = "avx512bw")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpackssdw))]
-pub fn _mm512_maskz_packs_epi32(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm512_maskz_packs_epi32(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
     unsafe {
         let pack = _mm512_packs_epi32(a, b).as_i16x32();
         transmute(simd_select_bitmask(k, pack, i16x32::ZERO))
@@ -6651,10 +6659,11 @@ pub const fn _mm_maskz_packs_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m12
 #[target_feature(enable = "avx512bw")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpacksswb))]
-pub fn _mm512_packs_epi16(a: __m512i, b: __m512i) -> __m512i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm512_packs_epi16(a: __m512i, b: __m512i) -> __m512i {
     unsafe {
-        let max = simd_splat(i16::from(i8::MAX));
-        let min = simd_splat(i16::from(i8::MIN));
+        let max = simd_splat(i8::MAX as i16);
+        let min = simd_splat(i8::MIN as i16);
 
         let clamped_a = simd_imax(simd_imin(a.as_i16x32(), max), min)
             .as_m512i()
@@ -6687,7 +6696,13 @@ pub fn _mm512_packs_epi16(a: __m512i, b: __m512i) -> __m512i {
 #[target_feature(enable = "avx512bw")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpacksswb))]
-pub fn _mm512_mask_packs_epi16(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm512_mask_packs_epi16(
+    src: __m512i,
+    k: __mmask64,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
     unsafe {
         let pack = _mm512_packs_epi16(a, b).as_i8x64();
         transmute(simd_select_bitmask(k, pack, src.as_i8x64()))
@@ -6701,7 +6716,8 @@ pub fn _mm512_mask_packs_epi16(src: __m512i, k: __mmask64, a: __m512i, b: __m512
 #[target_feature(enable = "avx512bw")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpacksswb))]
-pub fn _mm512_maskz_packs_epi16(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm512_maskz_packs_epi16(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
     unsafe {
         let pack = _mm512_packs_epi16(a, b).as_i8x64();
         transmute(simd_select_bitmask(k, pack, i8x64::ZERO))
@@ -6780,10 +6796,11 @@ pub const fn _mm_maskz_packs_epi16(k: __mmask16, a: __m128i, b: __m128i) -> __m1
 #[target_feature(enable = "avx512bw")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpackusdw))]
-pub fn _mm512_packus_epi32(a: __m512i, b: __m512i) -> __m512i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm512_packus_epi32(a: __m512i, b: __m512i) -> __m512i {
     unsafe {
-        let max = simd_splat(i32::from(u16::MAX));
-        let min = simd_splat(i32::from(u16::MIN));
+        let max = simd_splat(u16::MAX as i32);
+        let min = simd_splat(u16::MIN as i32);
 
         let clamped_a = simd_imax(simd_imin(a.as_i32x16(), max), min)
             .as_m512i()
@@ -6816,7 +6833,13 @@ pub fn _mm512_packus_epi32(a: __m512i, b: __m512i) -> __m512i {
 #[target_feature(enable = "avx512bw")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpackusdw))]
-pub fn _mm512_mask_packus_epi32(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm512_mask_packus_epi32(
+    src: __m512i,
+    k: __mmask32,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
     unsafe {
         let pack = _mm512_packus_epi32(a, b).as_i16x32();
         transmute(simd_select_bitmask(k, pack, src.as_i16x32()))
@@ -6830,7 +6853,8 @@ pub fn _mm512_mask_packus_epi32(src: __m512i, k: __mmask32, a: __m512i, b: __m51
 #[target_feature(enable = "avx512bw")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpackusdw))]
-pub fn _mm512_maskz_packus_epi32(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm512_maskz_packus_epi32(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
     unsafe {
         let pack = _mm512_packus_epi32(a, b).as_i16x32();
         transmute(simd_select_bitmask(k, pack, i16x32::ZERO))
@@ -6909,10 +6933,11 @@ pub const fn _mm_maskz_packus_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m1
 #[target_feature(enable = "avx512bw")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpackuswb))]
-pub fn _mm512_packus_epi16(a: __m512i, b: __m512i) -> __m512i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm512_packus_epi16(a: __m512i, b: __m512i) -> __m512i {
     unsafe {
-        let max = simd_splat(i16::from(u8::MAX));
-        let min = simd_splat(i16::from(u8::MIN));
+        let max = simd_splat(u8::MAX as i16);
+        let min = simd_splat(u8::MIN as i16);
 
         let clamped_a = simd_imax(simd_imin(a.as_i16x32(), max), min)
             .as_m512i()
@@ -6945,7 +6970,13 @@ pub fn _mm512_packus_epi16(a: __m512i, b: __m512i) -> __m512i {
 #[target_feature(enable = "avx512bw")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpackuswb))]
-pub fn _mm512_mask_packus_epi16(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm512_mask_packus_epi16(
+    src: __m512i,
+    k: __mmask64,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
     unsafe {
         let pack = _mm512_packus_epi16(a, b).as_i8x64();
         transmute(simd_select_bitmask(k, pack, src.as_i8x64()))
@@ -6959,7 +6990,8 @@ pub fn _mm512_mask_packus_epi16(src: __m512i, k: __mmask64, a: __m512i, b: __m51
 #[target_feature(enable = "avx512bw")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpackuswb))]
-pub fn _mm512_maskz_packus_epi16(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm512_maskz_packus_epi16(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
     unsafe {
         let pack = _mm512_packus_epi16(a, b).as_i8x64();
         transmute(simd_select_bitmask(k, pack, i8x64::ZERO))
@@ -17828,7 +17860,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw")]
-    fn test_mm512_packs_epi32() {
+    const fn test_mm512_packs_epi32() {
         let a = _mm512_set1_epi32(i32::MAX);
         let b = _mm512_set1_epi32(1);
         let r = _mm512_packs_epi32(a, b);
@@ -17839,7 +17871,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw")]
-    fn test_mm512_mask_packs_epi32() {
+    const fn test_mm512_mask_packs_epi32() {
         let a = _mm512_set1_epi32(i32::MAX);
         let b = _mm512_set1_epi32(1 << 16 | 1);
         let r = _mm512_mask_packs_epi32(a, 0, a, b);
@@ -17852,7 +17884,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw")]
-    fn test_mm512_maskz_packs_epi32() {
+    const fn test_mm512_maskz_packs_epi32() {
         let a = _mm512_set1_epi32(i32::MAX);
         let b = _mm512_set1_epi32(1);
         let r = _mm512_maskz_packs_epi32(0, a, b);
@@ -17911,7 +17943,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw")]
-    fn test_mm512_packs_epi16() {
+    const fn test_mm512_packs_epi16() {
         let a = _mm512_set1_epi16(i16::MAX);
         let b = _mm512_set1_epi16(1);
         let r = _mm512_packs_epi16(a, b);
@@ -17924,7 +17956,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw")]
-    fn test_mm512_mask_packs_epi16() {
+    const fn test_mm512_mask_packs_epi16() {
         let a = _mm512_set1_epi16(i16::MAX);
         let b = _mm512_set1_epi16(1 << 8 | 1);
         let r = _mm512_mask_packs_epi16(a, 0, a, b);
@@ -17944,7 +17976,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw")]
-    fn test_mm512_maskz_packs_epi16() {
+    const fn test_mm512_maskz_packs_epi16() {
         let a = _mm512_set1_epi16(i16::MAX);
         let b = _mm512_set1_epi16(1);
         let r = _mm512_maskz_packs_epi16(0, a, b);
@@ -18013,7 +18045,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw")]
-    fn test_mm512_packus_epi32() {
+    const fn test_mm512_packus_epi32() {
         let a = _mm512_set1_epi32(-1);
         let b = _mm512_set1_epi32(1);
         let r = _mm512_packus_epi32(a, b);
@@ -18024,7 +18056,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw")]
-    fn test_mm512_mask_packus_epi32() {
+    const fn test_mm512_mask_packus_epi32() {
         let a = _mm512_set1_epi32(-1);
         let b = _mm512_set1_epi32(1 << 16 | 1);
         let r = _mm512_mask_packus_epi32(a, 0, a, b);
@@ -18037,7 +18069,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw")]
-    fn test_mm512_maskz_packus_epi32() {
+    const fn test_mm512_maskz_packus_epi32() {
         let a = _mm512_set1_epi32(-1);
         let b = _mm512_set1_epi32(1);
         let r = _mm512_maskz_packus_epi32(0, a, b);
@@ -18094,7 +18126,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw")]
-    fn test_mm512_packus_epi16() {
+    const fn test_mm512_packus_epi16() {
         let a = _mm512_set1_epi16(-1);
         let b = _mm512_set1_epi16(1);
         let r = _mm512_packus_epi16(a, b);
@@ -18107,7 +18139,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw")]
-    fn test_mm512_mask_packus_epi16() {
+    const fn test_mm512_mask_packus_epi16() {
         let a = _mm512_set1_epi16(-1);
         let b = _mm512_set1_epi16(1 << 8 | 1);
         let r = _mm512_mask_packus_epi16(a, 0, a, b);
@@ -18127,7 +18159,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw")]
-    fn test_mm512_maskz_packus_epi16() {
+    const fn test_mm512_maskz_packus_epi16() {
         let a = _mm512_set1_epi16(-1);
         let b = _mm512_set1_epi16(1);
         let r = _mm512_maskz_packus_epi16(0, a, b);

From 47ef7f73de0547680678ef1184ed1f6889e69614 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Mon, 23 Feb 2026 11:58:18 +0100
Subject: [PATCH 14/19] aarch64: cleanup of some long array literals

---
 .../core_arch/src/aarch64/neon/generated.rs   | 539 ++----------------
 .../spec/neon/aarch64.spec.yml                | 108 ++--
 2 files changed, 108 insertions(+), 539 deletions(-)

diff --git a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs
index 41f01d445fc71..de64839661d6e 100644
--- a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs
+++ b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs
@@ -14131,26 +14131,7 @@ pub fn vmlaq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmlal_high_lane_s16<const LANE: i32>(a: int32x4_t, b: int16x8_t, c: int16x4_t) -> int32x4_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        vmlal_high_s16(
-            a,
-            b,
-            simd_shuffle!(
-                c,
-                c,
-                [
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32
-                ]
-            ),
-        )
-    }
+    unsafe { vmlal_high_s16(a, b, simd_shuffle!(c, c, [LANE as u32; 8])) }
 }
 #[doc = "Multiply-add long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_laneq_s16)"]
@@ -14165,26 +14146,7 @@ pub fn vmlal_high_laneq_s16<const LANE: i32>(
     c: int16x8_t,
 ) -> int32x4_t {
     static_assert_uimm_bits!(LANE, 3);
-    unsafe {
-        vmlal_high_s16(
-            a,
-            b,
-            simd_shuffle!(
-                c,
-                c,
-                [
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32
-                ]
-            ),
-        )
-    }
+    unsafe { vmlal_high_s16(a, b, simd_shuffle!(c, c, [LANE as u32; 8])) }
 }
 #[doc = "Multiply-add long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_lane_s32)"]
@@ -14195,13 +14157,7 @@ pub fn vmlal_high_laneq_s16<const LANE: i32>(
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmlal_high_lane_s32<const LANE: i32>(a: int64x2_t, b: int32x4_t, c: int32x2_t) -> int64x2_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        vmlal_high_s32(
-            a,
-            b,
-            simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]),
-        )
-    }
+    unsafe { vmlal_high_s32(a, b, simd_shuffle!(c, c, [LANE as u32; 4])) }
 }
 #[doc = "Multiply-add long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_laneq_s32)"]
@@ -14216,13 +14172,7 @@ pub fn vmlal_high_laneq_s32<const LANE: i32>(
     c: int32x4_t,
 ) -> int64x2_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        vmlal_high_s32(
-            a,
-            b,
-            simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]),
-        )
-    }
+    unsafe { vmlal_high_s32(a, b, simd_shuffle!(c, c, [LANE as u32; 4])) }
 }
 #[doc = "Multiply-add long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_lane_u16)"]
@@ -14237,26 +14187,7 @@ pub fn vmlal_high_lane_u16<const LANE: i32>(
     c: uint16x4_t,
 ) -> uint32x4_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        vmlal_high_u16(
-            a,
-            b,
-            simd_shuffle!(
-                c,
-                c,
-                [
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32
-                ]
-            ),
-        )
-    }
+    unsafe { vmlal_high_u16(a, b, simd_shuffle!(c, c, [LANE as u32; 8])) }
 }
 #[doc = "Multiply-add long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_laneq_u16)"]
@@ -14271,26 +14202,7 @@ pub fn vmlal_high_laneq_u16<const LANE: i32>(
     c: uint16x8_t,
 ) -> uint32x4_t {
     static_assert_uimm_bits!(LANE, 3);
-    unsafe {
-        vmlal_high_u16(
-            a,
-            b,
-            simd_shuffle!(
-                c,
-                c,
-                [
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32
-                ]
-            ),
-        )
-    }
+    unsafe { vmlal_high_u16(a, b, simd_shuffle!(c, c, [LANE as u32; 8])) }
 }
 #[doc = "Multiply-add long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_lane_u32)"]
@@ -14305,13 +14217,7 @@ pub fn vmlal_high_lane_u32<const LANE: i32>(
     c: uint32x2_t,
 ) -> uint64x2_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        vmlal_high_u32(
-            a,
-            b,
-            simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]),
-        )
-    }
+    unsafe { vmlal_high_u32(a, b, simd_shuffle!(c, c, [LANE as u32; 4])) }
 }
 #[doc = "Multiply-add long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_laneq_u32)"]
@@ -14326,13 +14232,7 @@ pub fn vmlal_high_laneq_u32<const LANE: i32>(
     c: uint32x4_t,
 ) -> uint64x2_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        vmlal_high_u32(
-            a,
-            b,
-            simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]),
-        )
-    }
+    unsafe { vmlal_high_u32(a, b, simd_shuffle!(c, c, [LANE as u32; 4])) }
 }
 #[doc = "Multiply-add long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_n_s16)"]
@@ -14475,26 +14375,7 @@ pub fn vmlsq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmlsl_high_lane_s16<const LANE: i32>(a: int32x4_t, b: int16x8_t, c: int16x4_t) -> int32x4_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        vmlsl_high_s16(
-            a,
-            b,
-            simd_shuffle!(
-                c,
-                c,
-                [
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32
-                ]
-            ),
-        )
-    }
+    unsafe { vmlsl_high_s16(a, b, simd_shuffle!(c, c, [LANE as u32; 8])) }
 }
 #[doc = "Multiply-subtract long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_laneq_s16)"]
@@ -14509,26 +14390,7 @@ pub fn vmlsl_high_laneq_s16<const LANE: i32>(
     c: int16x8_t,
 ) -> int32x4_t {
     static_assert_uimm_bits!(LANE, 3);
-    unsafe {
-        vmlsl_high_s16(
-            a,
-            b,
-            simd_shuffle!(
-                c,
-                c,
-                [
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32
-                ]
-            ),
-        )
-    }
+    unsafe { vmlsl_high_s16(a, b, simd_shuffle!(c, c, [LANE as u32; 8])) }
 }
 #[doc = "Multiply-subtract long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_lane_s32)"]
@@ -14539,13 +14401,7 @@ pub fn vmlsl_high_laneq_s16<const LANE: i32>(
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmlsl_high_lane_s32<const LANE: i32>(a: int64x2_t, b: int32x4_t, c: int32x2_t) -> int64x2_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        vmlsl_high_s32(
-            a,
-            b,
-            simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]),
-        )
-    }
+    unsafe { vmlsl_high_s32(a, b, simd_shuffle!(c, c, [LANE as u32; 4])) }
 }
 #[doc = "Multiply-subtract long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_laneq_s32)"]
@@ -14560,13 +14416,7 @@ pub fn vmlsl_high_laneq_s32<const LANE: i32>(
     c: int32x4_t,
 ) -> int64x2_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        vmlsl_high_s32(
-            a,
-            b,
-            simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]),
-        )
-    }
+    unsafe { vmlsl_high_s32(a, b, simd_shuffle!(c, c, [LANE as u32; 4])) }
 }
 #[doc = "Multiply-subtract long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_lane_u16)"]
@@ -14581,26 +14431,7 @@ pub fn vmlsl_high_lane_u16<const LANE: i32>(
     c: uint16x4_t,
 ) -> uint32x4_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        vmlsl_high_u16(
-            a,
-            b,
-            simd_shuffle!(
-                c,
-                c,
-                [
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32
-                ]
-            ),
-        )
-    }
+    unsafe { vmlsl_high_u16(a, b, simd_shuffle!(c, c, [LANE as u32; 8])) }
 }
 #[doc = "Multiply-subtract long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_laneq_u16)"]
@@ -14615,26 +14446,7 @@ pub fn vmlsl_high_laneq_u16<const LANE: i32>(
     c: uint16x8_t,
 ) -> uint32x4_t {
     static_assert_uimm_bits!(LANE, 3);
-    unsafe {
-        vmlsl_high_u16(
-            a,
-            b,
-            simd_shuffle!(
-                c,
-                c,
-                [
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32
-                ]
-            ),
-        )
-    }
+    unsafe { vmlsl_high_u16(a, b, simd_shuffle!(c, c, [LANE as u32; 8])) }
 }
 #[doc = "Multiply-subtract long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_lane_u32)"]
@@ -14649,13 +14461,7 @@ pub fn vmlsl_high_lane_u32<const LANE: i32>(
     c: uint32x2_t,
 ) -> uint64x2_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        vmlsl_high_u32(
-            a,
-            b,
-            simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]),
-        )
-    }
+    unsafe { vmlsl_high_u32(a, b, simd_shuffle!(c, c, [LANE as u32; 4])) }
 }
 #[doc = "Multiply-subtract long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_laneq_u32)"]
@@ -14670,13 +14476,7 @@ pub fn vmlsl_high_laneq_u32<const LANE: i32>(
     c: uint32x4_t,
 ) -> uint64x2_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        vmlsl_high_u32(
-            a,
-            b,
-            simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]),
-        )
-    }
+    unsafe { vmlsl_high_u32(a, b, simd_shuffle!(c, c, [LANE as u32; 4])) }
 }
 #[doc = "Multiply-subtract long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_n_s16)"]
@@ -14975,12 +14775,7 @@ pub fn vmul_lane_f64<const LANE: i32>(a: float64x1_t, b: float64x1_t) -> float64
 #[cfg(not(target_arch = "arm64ec"))]
 pub fn vmul_laneq_f16<const LANE: i32>(a: float16x4_t, b: float16x8_t) -> float16x4_t {
     static_assert_uimm_bits!(LANE, 3);
-    unsafe {
-        simd_mul(
-            a,
-            simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]),
-        )
-    }
+    unsafe { simd_mul(a, simd_shuffle!(b, b, [LANE as u32; 4])) }
 }
 #[doc = "Floating-point multiply"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_laneq_f16)"]
@@ -14992,25 +14787,7 @@ pub fn vmul_laneq_f16<const LANE: i32>(a: float16x4_t, b: float16x8_t) -> float1
 #[cfg(not(target_arch = "arm64ec"))]
 pub fn vmulq_laneq_f16<const LANE: i32>(a: float16x8_t, b: float16x8_t) -> float16x8_t {
     static_assert_uimm_bits!(LANE, 3);
-    unsafe {
-        simd_mul(
-            a,
-            simd_shuffle!(
-                b,
-                b,
-                [
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32
-                ]
-            ),
-        )
-    }
+    unsafe { simd_mul(a, simd_shuffle!(b, b, [LANE as u32; 8])) }
 }
 #[doc = "Floating-point multiply"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_laneq_f64)"]
@@ -15104,25 +14881,7 @@ pub fn vmulh_laneq_f16<const LANE: i32>(a: f16, b: float16x8_t) -> f16 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmull_high_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x4_t) -> int32x4_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        vmull_high_s16(
-            a,
-            simd_shuffle!(
-                b,
-                b,
-                [
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32
-                ]
-            ),
-        )
-    }
+    unsafe { vmull_high_s16(a, simd_shuffle!(b, b, [LANE as u32; 8])) }
 }
 #[doc = "Multiply long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_laneq_s16)"]
@@ -15133,25 +14892,7 @@ pub fn vmull_high_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x4_t) -> int32
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmull_high_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t) -> int32x4_t {
     static_assert_uimm_bits!(LANE, 3);
-    unsafe {
-        vmull_high_s16(
-            a,
-            simd_shuffle!(
-                b,
-                b,
-                [
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32
-                ]
-            ),
-        )
-    }
+    unsafe { vmull_high_s16(a, simd_shuffle!(b, b, [LANE as u32; 8])) }
 }
 #[doc = "Multiply long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_lane_s32)"]
@@ -15162,12 +14903,7 @@ pub fn vmull_high_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t) -> int3
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmull_high_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x2_t) -> int64x2_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        vmull_high_s32(
-            a,
-            simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]),
-        )
-    }
+    unsafe { vmull_high_s32(a, simd_shuffle!(b, b, [LANE as u32; 4])) }
 }
 #[doc = "Multiply long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_laneq_s32)"]
@@ -15178,12 +14914,7 @@ pub fn vmull_high_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x2_t) -> int64
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmull_high_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t) -> int64x2_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        vmull_high_s32(
-            a,
-            simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]),
-        )
-    }
+    unsafe { vmull_high_s32(a, simd_shuffle!(b, b, [LANE as u32; 4])) }
 }
 #[doc = "Multiply long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_lane_u16)"]
@@ -15194,25 +14925,7 @@ pub fn vmull_high_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t) -> int6
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmull_high_lane_u16<const LANE: i32>(a: uint16x8_t, b: uint16x4_t) -> uint32x4_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        vmull_high_u16(
-            a,
-            simd_shuffle!(
-                b,
-                b,
-                [
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32
-                ]
-            ),
-        )
-    }
+    unsafe { vmull_high_u16(a, simd_shuffle!(b, b, [LANE as u32; 8])) }
 }
 #[doc = "Multiply long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_laneq_u16)"]
@@ -15223,25 +14936,7 @@ pub fn vmull_high_lane_u16<const LANE: i32>(a: uint16x8_t, b: uint16x4_t) -> uin
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmull_high_laneq_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
     static_assert_uimm_bits!(LANE, 3);
-    unsafe {
-        vmull_high_u16(
-            a,
-            simd_shuffle!(
-                b,
-                b,
-                [
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32
-                ]
-            ),
-        )
-    }
+    unsafe { vmull_high_u16(a, simd_shuffle!(b, b, [LANE as u32; 8])) }
 }
 #[doc = "Multiply long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_lane_u32)"]
@@ -15252,12 +14947,7 @@ pub fn vmull_high_laneq_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t) -> ui
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmull_high_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint32x2_t) -> uint64x2_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        vmull_high_u32(
-            a,
-            simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]),
-        )
-    }
+    unsafe { vmull_high_u32(a, simd_shuffle!(b, b, [LANE as u32; 4])) }
 }
 #[doc = "Multiply long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_laneq_u32)"]
@@ -15268,12 +14958,7 @@ pub fn vmull_high_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint32x2_t) -> uin
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmull_high_laneq_u32<const LANE: i32>(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        vmull_high_u32(
-            a,
-            simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]),
-        )
-    }
+    unsafe { vmull_high_u32(a, simd_shuffle!(b, b, [LANE as u32; 4])) }
 }
 #[doc = "Multiply long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_n_s16)"]
@@ -15436,7 +15121,7 @@ pub fn vmull_p64(a: p64, b: p64) -> p128 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmulq_lane_f64<const LANE: i32>(a: float64x2_t, b: float64x1_t) -> float64x2_t {
     static_assert!(LANE == 0);
-    unsafe { simd_mul(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) }
+    unsafe { simd_mul(a, simd_shuffle!(b, b, [LANE as u32; 2])) }
 }
 #[doc = "Floating-point multiply"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_laneq_f64)"]
@@ -15447,7 +15132,7 @@ pub fn vmulq_lane_f64<const LANE: i32>(a: float64x2_t, b: float64x1_t) -> float6
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmulq_laneq_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t) -> float64x2_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe { simd_mul(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) }
+    unsafe { simd_mul(a, simd_shuffle!(b, b, [LANE as u32; 2])) }
 }
 #[doc = "Floating-point multiply"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmuls_lane_f32)"]
@@ -15599,12 +15284,7 @@ pub fn vmulxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
 #[cfg(not(target_arch = "arm64ec"))]
 pub fn vmulx_lane_f16<const LANE: i32>(a: float16x4_t, b: float16x4_t) -> float16x4_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        vmulx_f16(
-            a,
-            simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]),
-        )
-    }
+    unsafe { vmulx_f16(a, simd_shuffle!(b, b, [LANE as u32; 4])) }
 }
 #[doc = "Floating-point multiply extended"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulx_laneq_f16)"]
@@ -15616,12 +15296,7 @@ pub fn vmulx_lane_f16<const LANE: i32>(a: float16x4_t, b: float16x4_t) -> float1
 #[cfg(not(target_arch = "arm64ec"))]
 pub fn vmulx_laneq_f16<const LANE: i32>(a: float16x4_t, b: float16x8_t) -> float16x4_t {
     static_assert_uimm_bits!(LANE, 3);
-    unsafe {
-        vmulx_f16(
-            a,
-            simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]),
-        )
-    }
+    unsafe { vmulx_f16(a, simd_shuffle!(b, b, [LANE as u32; 4])) }
 }
 #[doc = "Floating-point multiply extended"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxq_lane_f16)"]
@@ -15633,25 +15308,7 @@ pub fn vmulx_laneq_f16<const LANE: i32>(a: float16x4_t, b: float16x8_t) -> float
 #[cfg(not(target_arch = "arm64ec"))]
 pub fn vmulxq_lane_f16<const LANE: i32>(a: float16x8_t, b: float16x4_t) -> float16x8_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        vmulxq_f16(
-            a,
-            simd_shuffle!(
-                b,
-                b,
-                [
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32
-                ]
-            ),
-        )
-    }
+    unsafe { vmulxq_f16(a, simd_shuffle!(b, b, [LANE as u32; 8])) }
 }
 #[doc = "Floating-point multiply extended"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxq_laneq_f16)"]
@@ -15663,25 +15320,7 @@ pub fn vmulxq_lane_f16<const LANE: i32>(a: float16x8_t, b: float16x4_t) -> float
 #[cfg(not(target_arch = "arm64ec"))]
 pub fn vmulxq_laneq_f16<const LANE: i32>(a: float16x8_t, b: float16x8_t) -> float16x8_t {
     static_assert_uimm_bits!(LANE, 3);
-    unsafe {
-        vmulxq_f16(
-            a,
-            simd_shuffle!(
-                b,
-                b,
-                [
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32,
-                    LANE as u32
-                ]
-            ),
-        )
-    }
+    unsafe { vmulxq_f16(a, simd_shuffle!(b, b, [LANE as u32; 8])) }
 }
 #[doc = "Floating-point multiply extended"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulx_lane_f32)"]
@@ -15692,7 +15331,7 @@ pub fn vmulxq_laneq_f16<const LANE: i32>(a: float16x8_t, b: float16x8_t) -> floa
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmulx_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t) -> float32x2_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe { vmulx_f32(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) }
+    unsafe { vmulx_f32(a, simd_shuffle!(b, b, [LANE as u32; 2])) }
 }
 #[doc = "Floating-point multiply extended"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulx_laneq_f32)"]
@@ -15703,7 +15342,7 @@ pub fn vmulx_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t) -> float3
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmulx_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x4_t) -> float32x2_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe { vmulx_f32(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) }
+    unsafe { vmulx_f32(a, simd_shuffle!(b, b, [LANE as u32; 2])) }
 }
 #[doc = "Floating-point multiply extended"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxq_lane_f32)"]
@@ -15714,12 +15353,7 @@ pub fn vmulx_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x4_t) -> float
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmulxq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x2_t) -> float32x4_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        vmulxq_f32(
-            a,
-            simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]),
-        )
-    }
+    unsafe { vmulxq_f32(a, simd_shuffle!(b, b, [LANE as u32; 4])) }
 }
 #[doc = "Floating-point multiply extended"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxq_laneq_f32)"]
@@ -15730,12 +15364,7 @@ pub fn vmulxq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x2_t) -> float
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmulxq_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t) -> float32x4_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        vmulxq_f32(
-            a,
-            simd_shuffle!(b, b, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]),
-        )
-    }
+    unsafe { vmulxq_f32(a, simd_shuffle!(b, b, [LANE as u32; 4])) }
 }
 #[doc = "Floating-point multiply extended"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulxq_laneq_f64)"]
@@ -15746,7 +15375,7 @@ pub fn vmulxq_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t) -> floa
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmulxq_laneq_f64<const LANE: i32>(a: float64x2_t, b: float64x2_t) -> float64x2_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe { vmulxq_f64(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) }
+    unsafe { vmulxq_f64(a, simd_shuffle!(b, b, [LANE as u32; 2])) }
 }
 #[doc = "Floating-point multiply extended"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulx_lane_f64)"]
@@ -15916,7 +15545,7 @@ pub fn vmulxh_laneq_f16<const LANE: i32>(a: f16, b: float16x8_t) -> f16 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmulxq_lane_f64<const LANE: i32>(a: float64x2_t, b: float64x1_t) -> float64x2_t {
     static_assert!(LANE == 0);
-    unsafe { vmulxq_f64(a, simd_shuffle!(b, b, [LANE as u32, LANE as u32])) }
+    unsafe { vmulxq_f64(a, simd_shuffle!(b, b, [LANE as u32; 2])) }
 }
 #[doc = "Negate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vneg_f64)"]
@@ -17916,8 +17545,7 @@ pub fn vqnegd_s64(a: i64) -> i64 {
 pub fn vqrdmlah_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t {
     static_assert_uimm_bits!(LANE, 2);
     unsafe {
-        let c: int16x4_t =
-            simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+        let c: int16x4_t = simd_shuffle!(c, c, [LANE as u32; 4]);
         vqrdmlah_s16(a, b, c)
     }
 }
@@ -17931,7 +17559,7 @@ pub fn vqrdmlah_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x4
 pub fn vqrdmlah_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t {
     static_assert_uimm_bits!(LANE, 1);
     unsafe {
-        let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]);
+        let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32; 2]);
         vqrdmlah_s32(a, b, c)
     }
 }
@@ -17945,8 +17573,7 @@ pub fn vqrdmlah_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int32x2
 pub fn vqrdmlah_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t {
     static_assert_uimm_bits!(LANE, 3);
     unsafe {
-        let c: int16x4_t =
-            simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+        let c: int16x4_t = simd_shuffle!(c, c, [LANE as u32; 4]);
         vqrdmlah_s16(a, b, c)
     }
 }
@@ -17960,7 +17587,7 @@ pub fn vqrdmlah_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x
 pub fn vqrdmlah_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t {
     static_assert_uimm_bits!(LANE, 2);
     unsafe {
-        let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]);
+        let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32; 2]);
         vqrdmlah_s32(a, b, c)
     }
 }
@@ -17974,20 +17601,7 @@ pub fn vqrdmlah_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int32x
 pub fn vqrdmlahq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t {
     static_assert_uimm_bits!(LANE, 2);
     unsafe {
-        let c: int16x8_t = simd_shuffle!(
-            c,
-            c,
-            [
-                LANE as u32,
-                LANE as u32,
-                LANE as u32,
-                LANE as u32,
-                LANE as u32,
-                LANE as u32,
-                LANE as u32,
-                LANE as u32
-            ]
-        );
+        let c: int16x8_t = simd_shuffle!(c, c, [LANE as u32; 8]);
         vqrdmlahq_s16(a, b, c)
     }
 }
@@ -18001,8 +17615,7 @@ pub fn vqrdmlahq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int16x
 pub fn vqrdmlahq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t {
     static_assert_uimm_bits!(LANE, 1);
     unsafe {
-        let c: int32x4_t =
-            simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+        let c: int32x4_t = simd_shuffle!(c, c, [LANE as u32; 4]);
         vqrdmlahq_s32(a, b, c)
     }
 }
@@ -18016,20 +17629,7 @@ pub fn vqrdmlahq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: int32x
 pub fn vqrdmlahq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t {
     static_assert_uimm_bits!(LANE, 3);
     unsafe {
-        let c: int16x8_t = simd_shuffle!(
-            c,
-            c,
-            [
-                LANE as u32,
-                LANE as u32,
-                LANE as u32,
-                LANE as u32,
-                LANE as u32,
-                LANE as u32,
-                LANE as u32,
-                LANE as u32
-            ]
-        );
+        let c: int16x8_t = simd_shuffle!(c, c, [LANE as u32; 8]);
         vqrdmlahq_s16(a, b, c)
     }
 }
@@ -18043,8 +17643,7 @@ pub fn vqrdmlahq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int16
 pub fn vqrdmlahq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t {
     static_assert_uimm_bits!(LANE, 2);
     unsafe {
-        let c: int32x4_t =
-            simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+        let c: int32x4_t = simd_shuffle!(c, c, [LANE as u32; 4]);
         vqrdmlahq_s32(a, b, c)
     }
 }
@@ -18190,8 +17789,7 @@ pub fn vqrdmlahs_s32(a: i32, b: i32, c: i32) -> i32 {
 pub fn vqrdmlsh_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t {
     static_assert_uimm_bits!(LANE, 2);
     unsafe {
-        let c: int16x4_t =
-            simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+        let c: int16x4_t = simd_shuffle!(c, c, [LANE as u32; 4]);
         vqrdmlsh_s16(a, b, c)
     }
 }
@@ -18205,7 +17803,7 @@ pub fn vqrdmlsh_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x4
 pub fn vqrdmlsh_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t {
     static_assert_uimm_bits!(LANE, 1);
     unsafe {
-        let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]);
+        let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32; 2]);
         vqrdmlsh_s32(a, b, c)
     }
 }
@@ -18219,8 +17817,7 @@ pub fn vqrdmlsh_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int32x2
 pub fn vqrdmlsh_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t {
     static_assert_uimm_bits!(LANE, 3);
     unsafe {
-        let c: int16x4_t =
-            simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+        let c: int16x4_t = simd_shuffle!(c, c, [LANE as u32; 4]);
         vqrdmlsh_s16(a, b, c)
     }
 }
@@ -18234,7 +17831,7 @@ pub fn vqrdmlsh_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x
 pub fn vqrdmlsh_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t {
     static_assert_uimm_bits!(LANE, 2);
     unsafe {
-        let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]);
+        let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32; 2]);
         vqrdmlsh_s32(a, b, c)
     }
 }
@@ -18248,20 +17845,7 @@ pub fn vqrdmlsh_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int32x
 pub fn vqrdmlshq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t {
     static_assert_uimm_bits!(LANE, 2);
     unsafe {
-        let c: int16x8_t = simd_shuffle!(
-            c,
-            c,
-            [
-                LANE as u32,
-                LANE as u32,
-                LANE as u32,
-                LANE as u32,
-                LANE as u32,
-                LANE as u32,
-                LANE as u32,
-                LANE as u32
-            ]
-        );
+        let c: int16x8_t = simd_shuffle!(c, c, [LANE as u32; 8]);
         vqrdmlshq_s16(a, b, c)
     }
 }
@@ -18275,8 +17859,7 @@ pub fn vqrdmlshq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int16x
 pub fn vqrdmlshq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t {
     static_assert_uimm_bits!(LANE, 1);
     unsafe {
-        let c: int32x4_t =
-            simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+        let c: int32x4_t = simd_shuffle!(c, c, [LANE as u32; 4]);
         vqrdmlshq_s32(a, b, c)
     }
 }
@@ -18290,20 +17873,7 @@ pub fn vqrdmlshq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: int32x
 pub fn vqrdmlshq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t {
     static_assert_uimm_bits!(LANE, 3);
     unsafe {
-        let c: int16x8_t = simd_shuffle!(
-            c,
-            c,
-            [
-                LANE as u32,
-                LANE as u32,
-                LANE as u32,
-                LANE as u32,
-                LANE as u32,
-                LANE as u32,
-                LANE as u32,
-                LANE as u32
-            ]
-        );
+        let c: int16x8_t = simd_shuffle!(c, c, [LANE as u32; 8]);
         vqrdmlshq_s16(a, b, c)
     }
 }
@@ -18317,8 +17887,7 @@ pub fn vqrdmlshq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int16
 pub fn vqrdmlshq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t {
     static_assert_uimm_bits!(LANE, 2);
     unsafe {
-        let c: int32x4_t =
-            simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+        let c: int32x4_t = simd_shuffle!(c, c, [LANE as u32; 4]);
         vqrdmlshq_s32(a, b, c)
     }
 }
diff --git a/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
index 0ec8024fdfbb6..8574aacee6671 100644
--- a/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
+++ b/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
@@ -5374,7 +5374,7 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - ["q_lane_f64", float64x2_t, float64x1_t, "q_f64", '[LANE as u32, LANE as u32]']
+      - ["q_lane_f64", float64x2_t, float64x1_t, "q_f64", '[LANE as u32; 2]']
     compose:
       - FnCall: [static_assert!, ['LANE == 0']]
       - FnCall:
@@ -5443,11 +5443,11 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - ['_lane_f32', float32x2_t, float32x2_t, '1', '_f32', '[LANE as u32, LANE as u32]']
-      - ['_laneq_f32', float32x2_t, float32x4_t, '2', '_f32', '[LANE as u32, LANE as u32]']
-      - ['q_lane_f32', float32x4_t, float32x2_t, '1', 'q_f32', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - ['q_laneq_f32', float32x4_t, float32x4_t, '2', 'q_f32', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - ['q_laneq_f64', float64x2_t, float64x2_t, '1', 'q_f64', '[LANE as u32, LANE as u32]']
+      - ['_lane_f32', float32x2_t, float32x2_t, '1', '_f32', '[LANE as u32; 2]']
+      - ['_laneq_f32', float32x2_t, float32x4_t, '2', '_f32', '[LANE as u32; 2]']
+      - ['q_lane_f32', float32x4_t, float32x2_t, '1', 'q_f32', '[LANE as u32; 4]']
+      - ['q_laneq_f32', float32x4_t, float32x4_t, '2', 'q_f32', '[LANE as u32; 4]']
+      - ['q_laneq_f64', float64x2_t, float64x2_t, '1', 'q_f64', '[LANE as u32; 2]']
     compose:
       - FnCall: [static_assert_uimm_bits!, ['LANE', "{type[3]}"]]
       - FnCall:
@@ -5473,10 +5473,10 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - ['_lane_f16', float16x4_t, float16x4_t, '2', '_f16', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - ['_laneq_f16', float16x4_t, float16x8_t, '3', '_f16', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - ['q_lane_f16', float16x8_t, float16x4_t, '2', 'q_f16', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - ['q_laneq_f16', float16x8_t, float16x8_t, '3', 'q_f16', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
+      - ['_lane_f16', float16x4_t, float16x4_t, '2', '_f16', '[LANE as u32; 4]']
+      - ['_laneq_f16', float16x4_t, float16x8_t, '3', '_f16', '[LANE as u32; 4]']
+      - ['q_lane_f16', float16x8_t, float16x4_t, '2', 'q_f16', '[LANE as u32; 8]']
+      - ['q_laneq_f16', float16x8_t, float16x8_t, '3', 'q_f16', '[LANE as u32; 8]']
     compose:
       - FnCall: [static_assert_uimm_bits!, ['LANE', "{type[3]}"]]
       - FnCall:
@@ -7755,14 +7755,14 @@ intrinsics:
     static_defs: ['const LANE: i32']
     safety: safe
     types:
-      - [_lane_s16, int16x4_t, int16x4_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [_laneq_s16, int16x4_t, int16x4_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [q_lane_s16, int16x8_t, int16x8_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [q_laneq_s16, int16x8_t, int16x8_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [_lane_s32, int32x2_t, int32x2_t, int32x2_t, '1', '[LANE as u32, LANE as u32]']
-      - [_laneq_s32, int32x2_t, int32x2_t, int32x4_t, '2', '[LANE as u32, LANE as u32]']
-      - [q_lane_s32, int32x4_t, int32x4_t, int32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [q_laneq_s32, int32x4_t, int32x4_t, int32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
+      - [_lane_s16, int16x4_t, int16x4_t, int16x4_t, '2', '[LANE as u32; 4]']
+      - [_laneq_s16, int16x4_t, int16x4_t, int16x8_t, '3', '[LANE as u32; 4]']
+      - [q_lane_s16, int16x8_t, int16x8_t, int16x4_t, '2', '[LANE as u32; 8]']
+      - [q_laneq_s16, int16x8_t, int16x8_t, int16x8_t, '3', '[LANE as u32; 8]']
+      - [_lane_s32, int32x2_t, int32x2_t, int32x2_t, '1', '[LANE as u32; 2]']
+      - [_laneq_s32, int32x2_t, int32x2_t, int32x4_t, '2', '[LANE as u32; 2]']
+      - [q_lane_s32, int32x4_t, int32x4_t, int32x2_t, '1', '[LANE as u32; 4]']
+      - [q_laneq_s32, int32x4_t, int32x4_t, int32x4_t, '2', '[LANE as u32; 4]']
     compose:
       - FnCall: [static_assert_uimm_bits!, [LANE, '{type[4]}']]
       - Let: [c, "{type[1]}", {FnCall: [simd_shuffle!, [c, c, "{type[5]}"]]}]
@@ -7839,14 +7839,14 @@ intrinsics:
     static_defs: ['const LANE: i32']
     safety: safe
     types:
-      - [_lane_s16, int16x4_t, int16x4_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [_laneq_s16, int16x4_t, int16x4_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [q_lane_s16, int16x8_t, int16x8_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [q_laneq_s16, int16x8_t, int16x8_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [_lane_s32, int32x2_t, int32x2_t, int32x2_t, '1', '[LANE as u32, LANE as u32]']
-      - [_laneq_s32, int32x2_t, int32x2_t, int32x4_t, '2', '[LANE as u32, LANE as u32]']
-      - [q_lane_s32, int32x4_t, int32x4_t, int32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [q_laneq_s32, int32x4_t, int32x4_t, int32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
+      - [_lane_s16, int16x4_t, int16x4_t, int16x4_t, '2', '[LANE as u32; 4]']
+      - [_laneq_s16, int16x4_t, int16x4_t, int16x8_t, '3', '[LANE as u32; 4]']
+      - [q_lane_s16, int16x8_t, int16x8_t, int16x4_t, '2', '[LANE as u32; 8]']
+      - [q_laneq_s16, int16x8_t, int16x8_t, int16x8_t, '3', '[LANE as u32; 8]']
+      - [_lane_s32, int32x2_t, int32x2_t, int32x2_t, '1', '[LANE as u32; 2]']
+      - [_laneq_s32, int32x2_t, int32x2_t, int32x4_t, '2', '[LANE as u32; 2]']
+      - [q_lane_s32, int32x4_t, int32x4_t, int32x2_t, '1', '[LANE as u32; 4]']
+      - [q_laneq_s32, int32x4_t, int32x4_t, int32x4_t, '2', '[LANE as u32; 4]']
     compose:
       - FnCall: [static_assert_uimm_bits!, [LANE, '{type[4]}']]
       - Let: [c, "{type[1]}", {FnCall: [simd_shuffle!, [c, c, "{type[5]}"]]}]
@@ -11138,7 +11138,7 @@ intrinsics:
       - FnCall:
           - simd_mul
           - - a
-            - FnCall: ["simd_shuffle!", [b, b, '[LANE as u32, LANE as u32]']]
+            - FnCall: ["simd_shuffle!", [b, b, '[LANE as u32; 2]']]
 
   - name: "vmuld_lane_f64"
     doc: "Floating-point multiply"
@@ -11195,7 +11195,7 @@ intrinsics:
       - FnCall:
           - simd_mul
           - - a
-            - FnCall: [simd_shuffle!, [b, b, '[LANE as u32, LANE as u32]']]
+            - FnCall: [simd_shuffle!, [b, b, '[LANE as u32; 2]']]
 
 
   # vmulq_laneq_f16
@@ -11212,8 +11212,8 @@ intrinsics:
     static_defs: ['const LANE: i32']
     safety: safe
     types:
-      - [float16x4_t, float16x8_t, '_lane', "[LANE as u32, LANE as u32, LANE as u32, LANE as u32]"]
-      - [float16x8_t, float16x8_t, 'q_lane', "[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]"]
+      - [float16x4_t, float16x8_t, '_lane', "[LANE as u32; 4]"]
+      - [float16x8_t, float16x8_t, 'q_lane', "[LANE as u32; 8]"]
     compose:
       - FnCall: [static_assert_uimm_bits!, [LANE, '3']]
       - FnCall:
@@ -11335,10 +11335,10 @@ intrinsics:
     static_defs: ['const LANE: i32']
     safety: safe
     types:
-      - [int32x4_t, int16x8_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [int32x4_t, int16x8_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [int64x2_t, int32x4_t, int32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [int64x2_t, int32x4_t, int32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
+      - [int32x4_t, int16x8_t, int16x4_t, '2', '[LANE as u32; 8]']
+      - [int32x4_t, int16x8_t, int16x8_t, '3', '[LANE as u32; 8]']
+      - [int64x2_t, int32x4_t, int32x2_t, '1', '[LANE as u32; 4]']
+      - [int64x2_t, int32x4_t, int32x4_t, '2', '[LANE as u32; 4]']
     compose:
       - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']]
       - FnCall:
@@ -11358,10 +11358,10 @@ intrinsics:
     static_defs: ['const LANE: i32']
     safety: safe
     types:
-      - [uint32x4_t, uint16x8_t, uint16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [uint32x4_t, uint16x8_t, uint16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [uint64x2_t, uint32x4_t, uint32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [uint64x2_t, uint32x4_t, uint32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
+      - [uint32x4_t, uint16x8_t, uint16x4_t, '2', '[LANE as u32; 8]']
+      - [uint32x4_t, uint16x8_t, uint16x8_t, '3', '[LANE as u32; 8]']
+      - [uint64x2_t, uint32x4_t, uint32x2_t, '1', '[LANE as u32; 4]']
+      - [uint64x2_t, uint32x4_t, uint32x4_t, '2', '[LANE as u32; 4]']
     compose:
       - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']]
       - FnCall:
@@ -11660,10 +11660,10 @@ intrinsics:
     static_defs: ['const LANE: i32']
     safety: safe
     types:
-      - [int16x8_t, int16x4_t, int32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [int16x8_t, int16x8_t, int32x4_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [int32x4_t, int32x2_t, int64x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [int32x4_t, int32x4_t, int64x2_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
+      - [int16x8_t, int16x4_t, int32x4_t, '2', '[LANE as u32; 8]']
+      - [int16x8_t, int16x8_t, int32x4_t, '3', '[LANE as u32; 8]']
+      - [int32x4_t, int32x2_t, int64x2_t, '1', '[LANE as u32; 4]']
+      - [int32x4_t, int32x4_t, int64x2_t, '2', '[LANE as u32; 4]']
     compose:
       - FnCall: [static_assert_uimm_bits!, [LANE, "{type[3]}"]]
       - FnCall:
@@ -11682,10 +11682,10 @@ intrinsics:
     static_defs: ['const LANE: i32']
     safety: safe
     types:
-      - [uint16x8_t, uint16x4_t, uint32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [uint16x8_t, uint16x8_t, uint32x4_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [uint32x4_t, uint32x2_t, uint64x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [uint32x4_t, uint32x4_t, uint64x2_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
+      - [uint16x8_t, uint16x4_t, uint32x4_t, '2', '[LANE as u32; 8]']
+      - [uint16x8_t, uint16x8_t, uint32x4_t, '3', '[LANE as u32; 8]']
+      - [uint32x4_t, uint32x2_t, uint64x2_t, '1', '[LANE as u32; 4]']
+      - [uint32x4_t, uint32x4_t, uint64x2_t, '2', '[LANE as u32; 4]']
     compose:
       - FnCall: [static_assert_uimm_bits!, [LANE, "{type[3]}"]]
       - FnCall:
@@ -11973,10 +11973,10 @@ intrinsics:
     static_defs: ['const LANE: i32']
     safety: safe
     types:
-      - [int32x4_t, int16x8_t, int16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [int32x4_t, int16x8_t, int16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [int64x2_t, int32x4_t, int32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [int64x2_t, int32x4_t, int32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
+      - [int32x4_t, int16x8_t, int16x4_t, '2', '[LANE as u32; 8]']
+      - [int32x4_t, int16x8_t, int16x8_t, '3', '[LANE as u32; 8]']
+      - [int64x2_t, int32x4_t, int32x2_t, '1', '[LANE as u32; 4]']
+      - [int64x2_t, int32x4_t, int32x4_t, '2', '[LANE as u32; 4]']
     compose:
       - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']]
       - FnCall: ['vmlal_high_{neon_type[2]}', [a, b, {FnCall: [simd_shuffle!, [c, c, '{type[4]}']]}]]
@@ -11992,10 +11992,10 @@ intrinsics:
     static_defs: ['const LANE: i32']
     safety: safe
     types:
-      - [uint32x4_t, uint16x8_t, uint16x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [uint32x4_t, uint16x8_t, uint16x8_t, '3', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [uint64x2_t, uint32x4_t, uint32x2_t, '1', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
-      - [uint64x2_t, uint32x4_t, uint32x4_t, '2', '[LANE as u32, LANE as u32, LANE as u32, LANE as u32]']
+      - [uint32x4_t, uint16x8_t, uint16x4_t, '2', '[LANE as u32; 8]']
+      - [uint32x4_t, uint16x8_t, uint16x8_t, '3', '[LANE as u32; 8]']
+      - [uint64x2_t, uint32x4_t, uint32x2_t, '1', '[LANE as u32; 4]']
+      - [uint64x2_t, uint32x4_t, uint32x4_t, '2', '[LANE as u32; 4]']
     compose:
       - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']]
       - FnCall: ['vmlal_high_{neon_type[2]}', [a, b, {FnCall: [simd_shuffle!, [c, c, '{type[4]}']]}]]

From 820ca4803853f80be0dbd29e31a82f623107e769 Mon Sep 17 00:00:00 2001
From: sayantn <sayantn05@gmail.com>
Date: Wed, 25 Feb 2026 04:49:15 +0530
Subject: [PATCH 15/19] Update Intel SDE version to 10.5

---
 library/stdarch/ci/docker/x86_64-unknown-linux-gnu/Dockerfile | 2 +-
 library/stdarch/ci/docker/x86_64-unknown-linux-gnu/cpuid.def  | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/library/stdarch/ci/docker/x86_64-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/x86_64-unknown-linux-gnu/Dockerfile
index 2743896375cf3..a357449d51e3d 100644
--- a/library/stdarch/ci/docker/x86_64-unknown-linux-gnu/Dockerfile
+++ b/library/stdarch/ci/docker/x86_64-unknown-linux-gnu/Dockerfile
@@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
   build-essential \
   lld
 
-RUN wget http://ci-mirrors.rust-lang.org/stdarch/sde-external-9.58.0-2025-06-16-lin.tar.xz -O sde.tar.xz
+RUN wget http://ci-mirrors.rust-lang.org/stdarch/sde-external-10.5.0-2026-01-13-lin.tar.xz -O sde.tar.xz
 RUN mkdir intel-sde
 RUN tar -xJf sde.tar.xz --strip-components=1 -C intel-sde
 ENV CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER="/intel-sde/sde64 \
diff --git a/library/stdarch/ci/docker/x86_64-unknown-linux-gnu/cpuid.def b/library/stdarch/ci/docker/x86_64-unknown-linux-gnu/cpuid.def
index 342f7d83a63e3..acf023ed0dc49 100644
--- a/library/stdarch/ci/docker/x86_64-unknown-linux-gnu/cpuid.def
+++ b/library/stdarch/ci/docker/x86_64-unknown-linux-gnu/cpuid.def
@@ -12,7 +12,7 @@
 # CPUID_VERSION = 1.0
 #      Input      =>               Output
 # EAX      ECX    =>   EAX      EBX      ECX      EDX
-00000000 ******** => 00000024 756e6547 6c65746e 49656e69
+00000000 ******** => 00000029 756e6547 6c65746e 49656e69
 00000001 ******** => 00400f10 00100800 7ffaf3ff bfebfbff
 00000002 ******** => 76035a01 00f0b6ff 00000000 00c10000
 00000003 ******** => 00000000 00000000 00000000 00000000
@@ -48,6 +48,7 @@
 0000001e 00000001 => 000001ff 00000000 00000000 00000000
 00000024 00000000 => 00000001 00070002 00000000 00000000 #AVX10
 00000024 00000001 => 00000000 00000000 00000004 00000000
+00000029 ******** => 00000000 00000001 00000000 00000000
 80000000 ******** => 80000008 00000000 00000000 00000000
 80000001 ******** => 00000000 00000000 00000121 2c100000
 80000002 ******** => 00000000 00000000 00000000 00000000

From f01bcce3e22cf232d25db92335e6fbaa3f6daaf1 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Wed, 25 Feb 2026 15:00:04 +0100
Subject: [PATCH 16/19] update to `resolver = 3`

---
 library/stdarch/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/library/stdarch/Cargo.toml b/library/stdarch/Cargo.toml
index 5979096439118..e3963a69879a1 100644
--- a/library/stdarch/Cargo.toml
+++ b/library/stdarch/Cargo.toml
@@ -1,5 +1,5 @@
 [workspace]
-resolver = "1"
+resolver = "3"
 members = [
   "crates/*",
   "examples",

From cb3046e5f2f0736366c0fea4977a8df579d96311 Mon Sep 17 00:00:00 2001
From: ArunTamil21 <aruntamiln@gmail.com>
Date: Wed, 25 Feb 2026 23:56:42 +0000
Subject: [PATCH 17/19] Add missing runtime test for _mm_comige_ss and fix
 _mm_comigt_ss test

---
 .../stdarch/crates/core_arch/src/x86/sse.rs   | 22 +++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/library/stdarch/crates/core_arch/src/x86/sse.rs b/library/stdarch/crates/core_arch/src/x86/sse.rs
index 2c4439a3f3a55..3f7781cc7dc4c 100644
--- a/library/stdarch/crates/core_arch/src/x86/sse.rs
+++ b/library/stdarch/crates/core_arch/src/x86/sse.rs
@@ -2816,14 +2816,32 @@ mod tests {
         let aa = &[3.0f32, 12.0, 23.0, NAN];
         let bb = &[3.0f32, 47.5, 1.5, NAN];
 
-        let ee = &[1i32, 0, 1, 0];
+        let ee = &[0i32, 0, 1, 0];
 
         for i in 0..4 {
             let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
             let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
 
-            let r = _mm_comige_ss(a, b);
+            let r = _mm_comigt_ss(a, b);
 
+            assert_eq!(
+                ee[i], r,
+                "_mm_comigt_ss({:?}, {:?}) = {}, expected: {} (i={})",
+                a, b, r, ee[i], i
+            );
+        }
+    }
+
+    #[simd_test(enable = "sse")]
+    fn test_mm_comige_ss() {
+        let aa = &[3.0f32, 23.0, 12.0, NAN];
+        let bb = &[3.0f32, 1.5, 47.5, NAN];
+        let ee = &[1i32, 1, 0, 0];
+
+        for i in 0..4 {
+            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
+            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
+            let r = _mm_comige_ss(a, b);
             assert_eq!(
                 ee[i], r,
                 "_mm_comige_ss({:?}, {:?}) = {}, expected: {} (i={})",

From f981ffe959bb8a8a99fa02b6864bb82d7b2d170a Mon Sep 17 00:00:00 2001
From: ArunTamil21 <aruntamiln@gmail.com>
Date: Thu, 26 Feb 2026 09:19:29 +0000
Subject: [PATCH 18/19] Remove _mm_comige_ss from skip list in x86-intel.rs

---
 library/stdarch/crates/stdarch-verify/tests/x86-intel.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs b/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs
index 4136463f197fd..2ac05e28cb4ce 100644
--- a/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs
+++ b/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs
@@ -246,7 +246,6 @@ fn verify_all_signatures() {
                 "_xend",
                 "_xabort_code",
                 // Aliases
-                "_mm_comige_ss",
                 "_mm_cvt_ss2si",
                 "_mm_cvtt_ss2si",
                 "_mm_cvt_si2ss",

From 2fa37e7b3498a9cb46004fae5fd9d6d40835ad28 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Thu, 26 Feb 2026 13:00:21 +0100
Subject: [PATCH 19/19] aarch64: fix UB in non-power-of-two reads and writes

---
 .../stdarch/crates/core_arch/src/macros.rs    | 29 ++++++++++++++-----
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/library/stdarch/crates/core_arch/src/macros.rs b/library/stdarch/crates/core_arch/src/macros.rs
index 5a582fe17772b..00e92428b3e7e 100644
--- a/library/stdarch/crates/core_arch/src/macros.rs
+++ b/library/stdarch/crates/core_arch/src/macros.rs
@@ -237,12 +237,12 @@ macro_rules! deinterleaving_load {
     ($elem:ty, $lanes:literal, 2, $ptr:expr) => {{
         use $crate::core_arch::macros::deinterleave_mask;
         use $crate::core_arch::simd::Simd;
-        use $crate::{mem::transmute, ptr};
+        use $crate::mem::transmute;
 
         type V = Simd<$elem, $lanes>;
         type W = Simd<$elem, { $lanes * 2 }>;
 
-        let w: W = ptr::read_unaligned($ptr as *const W);
+        let w: W = $crate::ptr::read_unaligned($ptr as *const W);
 
         let v0: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 2, 0>());
         let v1: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 2, 1>());
@@ -253,12 +253,20 @@ macro_rules! deinterleaving_load {
     ($elem:ty, $lanes:literal, 3, $ptr:expr) => {{
         use $crate::core_arch::macros::deinterleave_mask;
         use $crate::core_arch::simd::Simd;
-        use $crate::{mem::transmute, ptr};
+        use $crate::mem::{MaybeUninit, transmute};
 
         type V = Simd<$elem, $lanes>;
         type W = Simd<$elem, { $lanes * 3 }>;
 
-        let w: W = ptr::read_unaligned($ptr as *const W);
+        // NOTE: repr(simd) adds padding to make the total size a power of two.
+        // Hence reading W from ptr might read out of bounds.
+        let mut mem = MaybeUninit::<W>::uninit();
+        $crate::ptr::copy_nonoverlapping(
+            $ptr.cast::<$elem>(),
+            mem.as_mut_ptr().cast::<$elem>(),
+            $lanes * 3,
+        );
+        let w = mem.assume_init();
 
         let v0: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 3, 0>());
         let v1: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 3, 1>());
@@ -270,12 +278,12 @@ macro_rules! deinterleaving_load {
     ($elem:ty, $lanes:literal, 4, $ptr:expr) => {{
         use $crate::core_arch::macros::deinterleave_mask;
         use $crate::core_arch::simd::Simd;
-        use $crate::{mem::transmute, ptr};
+        use $crate::mem::transmute;
 
         type V = Simd<$elem, $lanes>;
         type W = Simd<$elem, { $lanes * 4 }>;
 
-        let w: W = ptr::read_unaligned($ptr as *const W);
+        let w: W = $crate::ptr::read_unaligned($ptr as *const W);
 
         let v0: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 4, 0>());
         let v1: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 4, 1>());
@@ -322,8 +330,15 @@ macro_rules! interleaving_store {
             simd_shuffle!($v.2, $v.2, identity::<{ $lanes * 2 }>());
 
         type W = Simd<$elem, { $lanes * 3 }>;
+
+        // NOTE: repr(simd) adds padding to make the total size a power of two.
+        // Hence writing W to ptr might write out of bounds.
         let w: W = simd_shuffle!(v0v1, v2v2, interleave_mask::<{ $lanes * 3 }, $lanes, 3>());
-        $crate::ptr::write_unaligned($ptr as *mut W, w);
+        $crate::ptr::copy_nonoverlapping(
+            (&w as *const W).cast::<$elem>(),
+            $ptr.cast::<$elem>(),
+            $lanes * 3,
+        );
     }};
 
     // N = 4