diff --git a/compiler/rustc_errors/src/lib.rs b/compiler/rustc_errors/src/lib.rs
index f5f7618285e1f..75bb0e8e7b43f 100644
--- a/compiler/rustc_errors/src/lib.rs
+++ b/compiler/rustc_errors/src/lib.rs
@@ -589,7 +589,8 @@ struct DiagCtxtInner {
     /// add more information). All stashed diagnostics must be emitted with
     /// `emit_stashed_diagnostics` by the time the `DiagCtxtInner` is dropped,
     /// otherwise an assertion failure will occur.
-    stashed_diagnostics: FxIndexMap<(Span, StashKey), (DiagInner, Option<ErrorGuaranteed>)>,
+    stashed_diagnostics:
+        FxIndexMap<StashKey, FxIndexMap<Span, (DiagInner, Option<ErrorGuaranteed>)>>,
 
     future_breakage_diagnostics: Vec<DiagInner>,
 
@@ -912,8 +913,12 @@ impl<'a> DiagCtxtHandle<'a> {
         // FIXME(Centril, #69537): Consider reintroducing panic on overwriting a stashed diagnostic
         // if/when we have a more robust macro-friendly replacement for `(span, key)` as a key.
         // See the PR for a discussion.
-        let key = (span.with_parent(None), key);
-        self.inner.borrow_mut().stashed_diagnostics.insert(key, (diag, guar));
+        self.inner
+            .borrow_mut()
+            .stashed_diagnostics
+            .entry(key)
+            .or_default()
+            .insert(span.with_parent(None), (diag, guar));
 
         guar
     }
@@ -922,9 +927,10 @@ impl<'a> DiagCtxtHandle<'a> {
     /// and [`StashKey`] as the key. Panics if the found diagnostic is an
     /// error.
     pub fn steal_non_err(self, span: Span, key: StashKey) -> Option<Diag<'a, ()>> {
-        let key = (span.with_parent(None), key);
         // FIXME(#120456) - is `swap_remove` correct?
-        let (diag, guar) = self.inner.borrow_mut().stashed_diagnostics.swap_remove(&key)?;
+        let (diag, guar) = self.inner.borrow_mut().stashed_diagnostics.get_mut(&key).and_then(
+            |stashed_diagnostics| stashed_diagnostics.swap_remove(&span.with_parent(None)),
+        )?;
         assert!(!diag.is_error());
         assert!(guar.is_none());
         Some(Diag::new_diagnostic(self, diag))
@@ -943,9 +949,10 @@ impl<'a> DiagCtxtHandle<'a> {
     where
         F: FnMut(&mut Diag<'_>),
     {
-        let key = (span.with_parent(None), key);
         // FIXME(#120456) - is `swap_remove` correct?
-        let err = self.inner.borrow_mut().stashed_diagnostics.swap_remove(&key);
+        let err = self.inner.borrow_mut().stashed_diagnostics.get_mut(&key).and_then(
+            |stashed_diagnostics| stashed_diagnostics.swap_remove(&span.with_parent(None)),
+        );
         err.map(|(err, guar)| {
             // The use of `::<ErrorGuaranteed>` is safe because level is `Level::Error`.
             assert_eq!(err.level, Error);
@@ -966,9 +973,10 @@ impl<'a> DiagCtxtHandle<'a> {
         key: StashKey,
         new_err: Diag<'_>,
     ) -> ErrorGuaranteed {
-        let key = (span.with_parent(None), key);
         // FIXME(#120456) - is `swap_remove` correct?
-        let old_err = self.inner.borrow_mut().stashed_diagnostics.swap_remove(&key);
+        let old_err = self.inner.borrow_mut().stashed_diagnostics.get_mut(&key).and_then(
+            |stashed_diagnostics| stashed_diagnostics.swap_remove(&span.with_parent(None)),
+        );
         match old_err {
             Some((old_err, guar)) => {
                 assert_eq!(old_err.level, Error);
@@ -983,7 +991,14 @@ impl<'a> DiagCtxtHandle<'a> {
     }
 
     pub fn has_stashed_diagnostic(&self, span: Span, key: StashKey) -> bool {
-        self.inner.borrow().stashed_diagnostics.get(&(span.with_parent(None), key)).is_some()
+        let inner = self.inner.borrow();
+        if let Some(stashed_diagnostics) = inner.stashed_diagnostics.get(&key)
+            && !stashed_diagnostics.is_empty()
+        {
+            stashed_diagnostics.contains_key(&span.with_parent(None))
+        } else {
+            false
+        }
     }
 
     /// Emit all stashed diagnostics.
@@ -997,7 +1012,11 @@ impl<'a> DiagCtxtHandle<'a> {
         let inner = self.inner.borrow();
         inner.err_guars.len()
             + inner.lint_err_guars.len()
-            + inner.stashed_diagnostics.values().filter(|(_diag, guar)| guar.is_some()).count()
+            + inner
+                .stashed_diagnostics
+                .values()
+                .map(|a| a.values().filter(|(_, guar)| guar.is_some()).count())
+                .sum::<usize>()
     }
 
     /// This excludes lint errors and delayed bugs. Unless absolutely
@@ -1486,16 +1505,18 @@ impl DiagCtxtInner {
     fn emit_stashed_diagnostics(&mut self) -> Option<ErrorGuaranteed> {
         let mut guar = None;
         let has_errors = !self.err_guars.is_empty();
-        for (_, (diag, _guar)) in std::mem::take(&mut self.stashed_diagnostics).into_iter() {
-            if !diag.is_error() {
-                // Unless they're forced, don't flush stashed warnings when
-                // there are errors, to avoid causing warning overload. The
-                // stash would've been stolen already if it were important.
-                if !diag.is_force_warn() && has_errors {
-                    continue;
+        for (_, stashed_diagnostics) in std::mem::take(&mut self.stashed_diagnostics).into_iter() {
+            for (_, (diag, _guar)) in stashed_diagnostics {
+                if !diag.is_error() {
+                    // Unless they're forced, don't flush stashed warnings when
+                    // there are errors, to avoid causing warning overload. The
+                    // stash would've been stolen already if it were important.
+                    if !diag.is_force_warn() && has_errors {
+                        continue;
+                    }
                 }
+                guar = guar.or(self.emit_diagnostic(diag, None));
             }
-            guar = guar.or(self.emit_diagnostic(diag, None));
         }
         guar
     }
@@ -1688,6 +1709,7 @@ impl DiagCtxtInner {
             if let Some((_diag, guar)) = self
                 .stashed_diagnostics
                 .values()
+                .flat_map(|stashed_diagnostics| stashed_diagnostics.values())
                 .find(|(diag, guar)| guar.is_some() && diag.is_lint.is_none())
             {
                 *guar
@@ -1700,13 +1722,9 @@ impl DiagCtxtInner {
     fn has_errors(&self) -> Option<ErrorGuaranteed> {
         self.err_guars.get(0).copied().or_else(|| self.lint_err_guars.get(0).copied()).or_else(
             || {
-                if let Some((_diag, guar)) =
-                    self.stashed_diagnostics.values().find(|(_diag, guar)| guar.is_some())
-                {
-                    *guar
-                } else {
-                    None
-                }
+                self.stashed_diagnostics.values().find_map(|stashed_diagnostics| {
+                    stashed_diagnostics.values().find_map(|(_, guar)| *guar)
+                })
             },
         )
     }
diff --git a/compiler/rustc_hir_typeck/src/method/suggest.rs b/compiler/rustc_hir_typeck/src/method/suggest.rs
index 246b23f11b68b..8be4d55542d08 100644
--- a/compiler/rustc_hir_typeck/src/method/suggest.rs
+++ b/compiler/rustc_hir_typeck/src/method/suggest.rs
@@ -25,6 +25,7 @@ use rustc_middle::bug;
 use rustc_middle::ty::fast_reject::{DeepRejectCtxt, TreatParams, simplify_type};
 use rustc_middle::ty::print::{
     PrintTraitRefExt as _, with_crate_prefix, with_forced_trimmed_paths,
+    with_no_visible_paths_if_doc_hidden,
 };
 use rustc_middle::ty::{self, GenericArgKind, IsSuggestable, Ty, TyCtxt, TypeVisitableExt};
 use rustc_span::def_id::DefIdSet;
@@ -3328,7 +3329,9 @@ impl<'a, 'tcx> FnCtxt<'a, 'tcx> {
             let path_strings = candidates.iter().map(|trait_did| {
                 format!(
                     "{prefix}{}{postfix}\n",
-                    with_crate_prefix!(self.tcx.def_path_str(*trait_did)),
+                    with_no_visible_paths_if_doc_hidden!(with_crate_prefix!(
+                        self.tcx.def_path_str(*trait_did)
+                    )),
                 )
             });
 
@@ -3336,7 +3339,9 @@ impl<'a, 'tcx> FnCtxt<'a, 'tcx> {
                 let parent_did = parent_map.get(trait_did).unwrap();
                 format!(
                     "{prefix}{}::*{postfix} // trait {}\n",
-                    with_crate_prefix!(self.tcx.def_path_str(*parent_did)),
+                    with_no_visible_paths_if_doc_hidden!(with_crate_prefix!(
+                        self.tcx.def_path_str(*parent_did)
+                    )),
                     self.tcx.item_name(*trait_did),
                 )
             });
diff --git a/compiler/rustc_middle/src/ty/print/pretty.rs b/compiler/rustc_middle/src/ty/print/pretty.rs
index 0765d066fe9fa..667cc5c3f0eb7 100644
--- a/compiler/rustc_middle/src/ty/print/pretty.rs
+++ b/compiler/rustc_middle/src/ty/print/pretty.rs
@@ -63,6 +63,7 @@ thread_local! {
     static FORCE_TRIMMED_PATH: Cell<bool> = const { Cell::new(false) };
     static REDUCED_QUERIES: Cell<bool> = const { Cell::new(false) };
     static NO_VISIBLE_PATH: Cell<bool> = const { Cell::new(false) };
+    static NO_VISIBLE_PATH_IF_DOC_HIDDEN: Cell<bool> = const { Cell::new(false) };
     static RTN_MODE: Cell<RtnMode> = const { Cell::new(RtnMode::ForDiagnostic) };
 }
 
@@ -134,6 +135,8 @@ define_helper!(
     /// Prevent selection of visible paths. `Display` impl of DefId will prefer
     /// visible (public) reexports of types as paths.
     fn with_no_visible_paths(NoVisibleGuard, NO_VISIBLE_PATH);
+    /// Prevent selection of visible paths if the paths are through a doc hidden path.
+    fn with_no_visible_paths_if_doc_hidden(NoVisibleIfDocHiddenGuard, NO_VISIBLE_PATH_IF_DOC_HIDDEN);
 );
 
 #[must_use]
@@ -569,6 +572,10 @@ pub trait PrettyPrinter<'tcx>: Printer<'tcx> + fmt::Write {
             return Ok(false);
         };
 
+        if self.tcx().is_doc_hidden(visible_parent) && with_no_visible_paths_if_doc_hidden() {
+            return Ok(false);
+        }
+
         let actual_parent = self.tcx().opt_parent(def_id);
         debug!(
             "try_print_visible_def_path: visible_parent={:?} actual_parent={:?}",
diff --git a/library/core/src/array/mod.rs b/library/core/src/array/mod.rs
index 28329bb090845..efa7bed7c8e17 100644
--- a/library/core/src/array/mod.rs
+++ b/library/core/src/array/mod.rs
@@ -55,12 +55,16 @@ pub fn repeat<T: Clone, const N: usize>(val: T) -> [T; N] {
     from_trusted_iterator(repeat_n(val, N))
 }
 
-/// Creates an array of type [T; N], where each element `T` is the returned value from `cb`
-/// using that element's index.
+/// Creates an array where each element is produced by calling `f` with
+/// that element's index while walking forward through the array.
 ///
-/// # Arguments
+/// This is essentially the same as writing
+/// ```text
+/// [f(0), f(1), f(2), …, f(N - 2), f(N - 1)]
+/// ```
+/// and is similar to `(0..i).map(f)`, just for arrays not iterators.
 ///
-/// * `cb`: Callback where the passed argument is the current array index.
+/// If `N == 0`, this produces an empty array without ever calling `f`.
 ///
 /// # Example
 ///
@@ -82,13 +86,30 @@ pub fn repeat<T: Clone, const N: usize>(val: T) -> [T; N] {
 /// // indexes are:       0     1      2     3      4
 /// assert_eq!(bool_arr, [true, false, true, false, true]);
 /// ```
+///
+/// You can also capture things, for example to create an array full of clones
+/// where you can't just use `[item; N]` because it's not `Copy`:
+/// ```
+/// # // TBH `array::repeat` would be better for this, but it's not stable yet.
+/// let my_string = String::from("Hello");
+/// let clones: [String; 42] = std::array::from_fn(|_| my_string.clone());
+/// assert!(clones.iter().all(|x| *x == my_string));
+/// ```
+///
+/// The array is generated in ascending index order, starting from the front
+/// and going towards the back, so you can use closures with mutable state:
+/// ```
+/// let mut state = 1;
+/// let a = std::array::from_fn(|_| { let x = state; state *= 2; x });
+/// assert_eq!(a, [1, 2, 4, 8, 16, 32]);
+/// ```
 #[inline]
 #[stable(feature = "array_from_fn", since = "1.63.0")]
-pub fn from_fn<T, const N: usize, F>(cb: F) -> [T; N]
+pub fn from_fn<T, const N: usize, F>(f: F) -> [T; N]
 where
     F: FnMut(usize) -> T,
 {
-    try_from_fn(NeverShortCircuit::wrap_mut_1(cb)).0
+    try_from_fn(NeverShortCircuit::wrap_mut_1(f)).0
 }
 
 /// Creates an array `[T; N]` where each fallible array element `T` is returned by the `cb` call.
diff --git a/library/core/src/macros/mod.rs b/library/core/src/macros/mod.rs
index 5f200b31d1ae7..5571d8ad594a1 100644
--- a/library/core/src/macros/mod.rs
+++ b/library/core/src/macros/mod.rs
@@ -237,9 +237,10 @@ pub macro assert_matches {
 /// ```
 #[unstable(feature = "cfg_match", issue = "115585")]
 #[rustc_diagnostic_item = "cfg_match"]
+#[rustc_macro_transparency = "semitransparent"]
 pub macro cfg_match {
     ({ $($tt:tt)* }) => {{
-        cfg_match! { $($tt)* }
+        $crate::cfg_match! { $($tt)* }
     }},
     (_ => { $($output:tt)* }) => {
         $($output)*
@@ -249,10 +250,10 @@ pub macro cfg_match {
         $($( $rest:tt )+)?
     ) => {
         #[cfg($cfg)]
-        cfg_match! { _ => $output }
+        $crate::cfg_match! { _ => $output }
         $(
             #[cfg(not($cfg))]
-            cfg_match! { $($rest)+ }
+            $crate::cfg_match! { $($rest)+ }
         )?
     },
 }
diff --git a/library/core/src/ptr/mod.rs b/library/core/src/ptr/mod.rs
index ea53da78d3bd2..2357ba23aa0d2 100644
--- a/library/core/src/ptr/mod.rs
+++ b/library/core/src/ptr/mod.rs
@@ -398,6 +398,7 @@ use crate::cmp::Ordering;
 use crate::intrinsics::const_eval_select;
 use crate::marker::FnPtr;
 use crate::mem::{self, MaybeUninit, SizedTypeProperties};
+use crate::num::NonZero;
 use crate::{fmt, hash, intrinsics, ub_checks};
 
 mod alignment;
@@ -1094,51 +1095,25 @@ pub const unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
             // are pointers inside `T` we will copy them in one go rather than trying to copy a part
             // of a pointer (which would not work).
             // SAFETY: Same preconditions as this function
-            unsafe { swap_nonoverlapping_simple_untyped(x, y, count) }
+            unsafe { swap_nonoverlapping_const(x, y, count) }
         } else {
-            macro_rules! attempt_swap_as_chunks {
-                ($ChunkTy:ty) => {
-                    if align_of::<T>() >= align_of::<$ChunkTy>()
-                        && size_of::<T>() % size_of::<$ChunkTy>() == 0
-                    {
-                        let x: *mut $ChunkTy = x.cast();
-                        let y: *mut $ChunkTy = y.cast();
-                        let count = count * (size_of::<T>() / size_of::<$ChunkTy>());
-                        // SAFETY: these are the same bytes that the caller promised were
-                        // ok, just typed as `MaybeUninit<ChunkTy>`s instead of as `T`s.
-                        // The `if` condition above ensures that we're not violating
-                        // alignment requirements, and that the division is exact so
-                        // that we don't lose any bytes off the end.
-                        return unsafe { swap_nonoverlapping_simple_untyped(x, y, count) };
-                    }
-                };
+            // Going though a slice here helps codegen know the size fits in `isize`
+            let slice = slice_from_raw_parts_mut(x, count);
+            // SAFETY: This is all readable from the pointer, meaning it's one
+            // allocated object, and thus cannot be more than isize::MAX bytes.
+            let bytes = unsafe { mem::size_of_val_raw::<[T]>(slice) };
+            if let Some(bytes) = NonZero::new(bytes) {
+                // SAFETY: These are the same ranges, just expressed in a different
+                // type, so they're still non-overlapping.
+                unsafe { swap_nonoverlapping_bytes(x.cast(), y.cast(), bytes) };
             }
-
-            // Split up the slice into small power-of-two-sized chunks that LLVM is able
-            // to vectorize (unless it's a special type with more-than-pointer alignment,
-            // because we don't want to pessimize things like slices of SIMD vectors.)
-            if align_of::<T>() <= size_of::<usize>()
-            && (!size_of::<T>().is_power_of_two()
-                || size_of::<T>() > size_of::<usize>() * 2)
-            {
-                attempt_swap_as_chunks!(usize);
-                attempt_swap_as_chunks!(u8);
-            }
-
-            // SAFETY: Same preconditions as this function
-            unsafe { swap_nonoverlapping_simple_untyped(x, y, count) }
         }
     )
 }
 
 /// Same behavior and safety conditions as [`swap_nonoverlapping`]
-///
-/// LLVM can vectorize this (at least it can for the power-of-two-sized types
-/// `swap_nonoverlapping` tries to use) so no need to manually SIMD it.
 #[inline]
-const unsafe fn swap_nonoverlapping_simple_untyped<T>(x: *mut T, y: *mut T, count: usize) {
-    let x = x.cast::<MaybeUninit<T>>();
-    let y = y.cast::<MaybeUninit<T>>();
+const unsafe fn swap_nonoverlapping_const<T>(x: *mut T, y: *mut T, count: usize) {
     let mut i = 0;
     while i < count {
         // SAFETY: By precondition, `i` is in-bounds because it's below `n`
@@ -1147,26 +1122,91 @@ const unsafe fn swap_nonoverlapping_simple_untyped<T>(x: *mut T, y: *mut T, coun
         // and it's distinct from `x` since the ranges are non-overlapping
         let y = unsafe { y.add(i) };
 
-        // If we end up here, it's because we're using a simple type -- like
-        // a small power-of-two-sized thing -- or a special type with particularly
-        // large alignment, particularly SIMD types.
-        // Thus, we're fine just reading-and-writing it, as either it's small
-        // and that works well anyway or it's special and the type's author
-        // presumably wanted things to be done in the larger chunk.
-
         // SAFETY: we're only ever given pointers that are valid to read/write,
         // including being aligned, and nothing here panics so it's drop-safe.
         unsafe {
-            let a: MaybeUninit<T> = read(x);
-            let b: MaybeUninit<T> = read(y);
-            write(x, b);
-            write(y, a);
+            // Note that it's critical that these use `copy_nonoverlapping`,
+            // rather than `read`/`write`, to avoid #134713 if T has padding.
+            let mut temp = MaybeUninit::<T>::uninit();
+            copy_nonoverlapping(x, temp.as_mut_ptr(), 1);
+            copy_nonoverlapping(y, x, 1);
+            copy_nonoverlapping(temp.as_ptr(), y, 1);
         }
 
         i += 1;
     }
 }
 
+// Don't let MIR inline this, because we really want it to keep its noalias metadata
+#[rustc_no_mir_inline]
+#[inline]
+fn swap_chunk<const N: usize>(x: &mut MaybeUninit<[u8; N]>, y: &mut MaybeUninit<[u8; N]>) {
+    let a = *x;
+    let b = *y;
+    *x = b;
+    *y = a;
+}
+
+#[inline]
+unsafe fn swap_nonoverlapping_bytes(x: *mut u8, y: *mut u8, bytes: NonZero<usize>) {
+    // Same as `swap_nonoverlapping::<[u8; N]>`.
+    unsafe fn swap_nonoverlapping_chunks<const N: usize>(
+        x: *mut MaybeUninit<[u8; N]>,
+        y: *mut MaybeUninit<[u8; N]>,
+        chunks: NonZero<usize>,
+    ) {
+        let chunks = chunks.get();
+        for i in 0..chunks {
+            // SAFETY: i is in [0, chunks) so the adds and dereferences are in-bounds.
+            unsafe { swap_chunk(&mut *x.add(i), &mut *y.add(i)) };
+        }
+    }
+
+    // Same as `swap_nonoverlapping_bytes`, but accepts at most 1+2+4=7 bytes
+    #[inline]
+    unsafe fn swap_nonoverlapping_short(x: *mut u8, y: *mut u8, bytes: NonZero<usize>) {
+        // Tail handling for auto-vectorized code sometimes has element-at-a-time behaviour,
+        // see <https://github.com/rust-lang/rust/issues/134946>.
+        // By swapping as different sizes, rather than as a loop over bytes,
+        // we make sure not to end up with, say, seven byte-at-a-time copies.
+
+        let bytes = bytes.get();
+        let mut i = 0;
+        macro_rules! swap_prefix {
+            ($($n:literal)+) => {$(
+                if (bytes & $n) != 0 {
+                    // SAFETY: `i` can only have the same bits set as those in bytes,
+                    // so these `add`s are in-bounds of `bytes`.  But the bit for
+                    // `$n` hasn't been set yet, so the `$n` bytes that `swap_chunk`
+                    // will read and write are within the usable range.
+                    unsafe { swap_chunk::<$n>(&mut*x.add(i).cast(), &mut*y.add(i).cast()) };
+                    i |= $n;
+                }
+            )+};
+        }
+        swap_prefix!(4 2 1);
+        debug_assert_eq!(i, bytes);
+    }
+
+    const CHUNK_SIZE: usize = size_of::<*const ()>();
+    let bytes = bytes.get();
+
+    let chunks = bytes / CHUNK_SIZE;
+    let tail = bytes % CHUNK_SIZE;
+    if let Some(chunks) = NonZero::new(chunks) {
+        // SAFETY: this is bytes/CHUNK_SIZE*CHUNK_SIZE bytes, which is <= bytes,
+        // so it's within the range of our non-overlapping bytes.
+        unsafe { swap_nonoverlapping_chunks::<CHUNK_SIZE>(x.cast(), y.cast(), chunks) };
+    }
+    if let Some(tail) = NonZero::new(tail) {
+        const { assert!(CHUNK_SIZE <= 8) };
+        let delta = chunks * CHUNK_SIZE;
+        // SAFETY: the tail length is below CHUNK SIZE because of the remainder,
+        // and CHUNK_SIZE is at most 8 by the const assert, so tail <= 7
+        unsafe { swap_nonoverlapping_short(x.add(delta), y.add(delta), tail) };
+    }
+}
+
 /// Moves `src` into the pointed `dst`, returning the previous `dst` value.
 ///
 /// Neither value is dropped.
diff --git a/library/coretests/tests/ptr.rs b/library/coretests/tests/ptr.rs
index 6091926084a35..cc5f7946863a6 100644
--- a/library/coretests/tests/ptr.rs
+++ b/library/coretests/tests/ptr.rs
@@ -984,3 +984,39 @@ fn test_ptr_metadata_in_const() {
     assert_eq!(SLICE_META, 3);
     assert_eq!(DYN_META.size_of(), 42);
 }
+
+// See <https://github.com/rust-lang/rust/issues/134713>
+const fn ptr_swap_nonoverlapping_is_untyped_inner() {
+    #[repr(C)]
+    struct HasPadding(usize, u8);
+
+    let buf1: [usize; 2] = [1000, 2000];
+    let buf2: [usize; 2] = [3000, 4000];
+
+    // HasPadding and [usize; 2] have the same size and alignment,
+    // so swap_nonoverlapping should treat them the same
+    assert!(size_of::<HasPadding>() == size_of::<[usize; 2]>());
+    assert!(align_of::<HasPadding>() == align_of::<[usize; 2]>());
+
+    let mut b1 = buf1;
+    let mut b2 = buf2;
+    // Safety: b1 and b2 are distinct local variables,
+    // with the same size and alignment as HasPadding.
+    unsafe {
+        std::ptr::swap_nonoverlapping(
+            b1.as_mut_ptr().cast::<HasPadding>(),
+            b2.as_mut_ptr().cast::<HasPadding>(),
+            1,
+        );
+    }
+    assert!(b1[0] == buf2[0]);
+    assert!(b1[1] == buf2[1]);
+    assert!(b2[0] == buf1[0]);
+    assert!(b2[1] == buf1[1]);
+}
+
+#[test]
+fn test_ptr_swap_nonoverlapping_is_untyped() {
+    ptr_swap_nonoverlapping_is_untyped_inner();
+    const { ptr_swap_nonoverlapping_is_untyped_inner() };
+}
diff --git a/library/std/src/sys/path/windows.rs b/library/std/src/sys/path/windows.rs
index 1c53472191699..6547ed9aa5f32 100644
--- a/library/std/src/sys/path/windows.rs
+++ b/library/std/src/sys/path/windows.rs
@@ -350,3 +350,46 @@ pub(crate) fn absolute(path: &Path) -> io::Result<PathBuf> {
 pub(crate) fn is_absolute(path: &Path) -> bool {
     path.has_root() && path.prefix().is_some()
 }
+
+/// Test that the path is absolute, fully qualified and unchanged when processed by the Windows API.
+///
+/// For example:
+///
+/// - `C:\path\to\file` will return true.
+/// - `C:\path\to\nul` returns false because the Windows API will convert it to \\.\NUL
+/// - `C:\path\to\..\file` returns false because it will be resolved to `C:\path\file`.
+///
+/// This is a useful property because it means the path can be converted from and to and verbatim
+/// path just by changing the prefix.
+pub(crate) fn is_absolute_exact(path: &[u16]) -> bool {
+    // This is implemented by checking that passing the path through
+    // GetFullPathNameW does not change the path in any way.
+
+    // Windows paths are limited to i16::MAX length
+    // though the API here accepts a u32 for the length.
+    if path.is_empty() || path.len() > u32::MAX as usize || path.last() != Some(&0) {
+        return false;
+    }
+    // The path returned by `GetFullPathNameW` must be the same length as the
+    // given path, otherwise they're not equal.
+    let buffer_len = path.len();
+    let mut new_path = Vec::with_capacity(buffer_len);
+    let result = unsafe {
+        c::GetFullPathNameW(
+            path.as_ptr(),
+            new_path.capacity() as u32,
+            new_path.as_mut_ptr(),
+            crate::ptr::null_mut(),
+        )
+    };
+    // Note: if non-zero, the returned result is the length of the buffer without the null termination
+    if result == 0 || result as usize != buffer_len - 1 {
+        false
+    } else {
+        // SAFETY: `GetFullPathNameW` initialized `result` bytes and does not exceed `nBufferLength - 1` (capacity).
+        unsafe {
+            new_path.set_len((result as usize) + 1);
+        }
+        path == &new_path
+    }
+}
diff --git a/library/std/src/sys/path/windows/tests.rs b/library/std/src/sys/path/windows/tests.rs
index f2a60e30bc610..9eb79203dcac7 100644
--- a/library/std/src/sys/path/windows/tests.rs
+++ b/library/std/src/sys/path/windows/tests.rs
@@ -135,3 +135,15 @@ fn broken_unc_path() {
     assert_eq!(components.next(), Some(Component::Normal("foo".as_ref())));
     assert_eq!(components.next(), Some(Component::Normal("bar".as_ref())));
 }
+
+#[test]
+fn test_is_absolute_exact() {
+    use crate::sys::pal::api::wide_str;
+    // These paths can be made verbatim by only changing their prefix.
+    assert!(is_absolute_exact(wide_str!(r"C:\path\to\file")));
+    assert!(is_absolute_exact(wide_str!(r"\\server\share\path\to\file")));
+    // These paths change more substantially
+    assert!(!is_absolute_exact(wide_str!(r"C:\path\to\..\file")));
+    assert!(!is_absolute_exact(wide_str!(r"\\server\share\path\to\..\file")));
+    assert!(!is_absolute_exact(wide_str!(r"C:\path\to\NUL"))); // Converts to \\.\NUL
+}
diff --git a/library/std/src/sys/process/windows.rs b/library/std/src/sys/process/windows.rs
index 06c15e08f3fb1..4cfdf908c58de 100644
--- a/library/std/src/sys/process/windows.rs
+++ b/library/std/src/sys/process/windows.rs
@@ -19,7 +19,7 @@ use crate::sys::args::{self, Arg};
 use crate::sys::c::{self, EXIT_FAILURE, EXIT_SUCCESS};
 use crate::sys::fs::{File, OpenOptions};
 use crate::sys::handle::Handle;
-use crate::sys::pal::api::{self, WinError};
+use crate::sys::pal::api::{self, WinError, utf16};
 use crate::sys::pal::{ensure_no_nuls, fill_utf16_buf};
 use crate::sys::pipe::{self, AnonPipe};
 use crate::sys::{cvt, path, stdio};
@@ -880,9 +880,33 @@ fn make_envp(maybe_env: Option<BTreeMap<EnvKey, OsString>>) -> io::Result<(*mut
 fn make_dirp(d: Option<&OsString>) -> io::Result<(*const u16, Vec<u16>)> {
     match d {
         Some(dir) => {
-            let mut dir_str: Vec<u16> = ensure_no_nuls(dir)?.encode_wide().collect();
-            dir_str.push(0);
-            Ok((dir_str.as_ptr(), dir_str))
+            let mut dir_str: Vec<u16> = ensure_no_nuls(dir)?.encode_wide().chain([0]).collect();
+            // Try to remove the `\\?\` prefix, if any.
+            // This is necessary because the current directory does not support verbatim paths.
+            // However. this can only be done if it doesn't change how the path will be resolved.
+            let ptr = if dir_str.starts_with(utf16!(r"\\?\UNC")) {
+                // Turn the `C` in `UNC` into a `\` so we can then use `\\rest\of\path`.
+                let start = r"\\?\UN".len();
+                dir_str[start] = b'\\' as u16;
+                if path::is_absolute_exact(&dir_str[start..]) {
+                    dir_str[start..].as_ptr()
+                } else {
+                    // Revert the above change.
+                    dir_str[start] = b'C' as u16;
+                    dir_str.as_ptr()
+                }
+            } else if dir_str.starts_with(utf16!(r"\\?\")) {
+                // Strip the leading `\\?\`
+                let start = r"\\?\".len();
+                if path::is_absolute_exact(&dir_str[start..]) {
+                    dir_str[start..].as_ptr()
+                } else {
+                    dir_str.as_ptr()
+                }
+            } else {
+                dir_str.as_ptr()
+            };
+            Ok((ptr, dir_str))
         }
         None => Ok((ptr::null(), Vec::new())),
     }
diff --git a/src/ci/citool/src/analysis.rs b/src/ci/citool/src/analysis.rs
index 7fbfad467c641..208a494183c02 100644
--- a/src/ci/citool/src/analysis.rs
+++ b/src/ci/citool/src/analysis.rs
@@ -7,6 +7,7 @@ use build_helper::metrics::{
     format_build_steps,
 };
 
+use crate::github::JobInfoResolver;
 use crate::metrics;
 use crate::metrics::{JobMetrics, JobName, get_test_suites};
 use crate::utils::{output_details, pluralize};
@@ -185,13 +186,19 @@ fn render_table(suites: BTreeMap<String, TestSuiteRecord>) -> String {
 }
 
 /// Outputs a report of test differences between the `parent` and `current` commits.
-pub fn output_test_diffs(job_metrics: &HashMap<JobName, JobMetrics>) {
+pub fn output_test_diffs(
+    job_metrics: &HashMap<JobName, JobMetrics>,
+    job_info_resolver: &mut JobInfoResolver,
+) {
     let aggregated_test_diffs = aggregate_test_diffs(&job_metrics);
-    report_test_diffs(aggregated_test_diffs);
+    report_test_diffs(aggregated_test_diffs, job_metrics, job_info_resolver);
 }
 
 /// Prints the ten largest differences in bootstrap durations.
-pub fn output_largest_duration_changes(job_metrics: &HashMap<JobName, JobMetrics>) {
+pub fn output_largest_duration_changes(
+    job_metrics: &HashMap<JobName, JobMetrics>,
+    job_info_resolver: &mut JobInfoResolver,
+) {
     struct Entry<'a> {
         job: &'a JobName,
         before: Duration,
@@ -225,14 +232,14 @@ pub fn output_largest_duration_changes(job_metrics: &HashMap<JobName, JobMetrics
             });
         }
     }
-    changes.sort_by(|e1, e2| e1.change.partial_cmp(&e2.change).unwrap().reverse());
+    changes.sort_by(|e1, e2| e1.change.abs().partial_cmp(&e2.change.abs()).unwrap().reverse());
 
     println!("# Job duration changes");
     for (index, entry) in changes.into_iter().take(10).enumerate() {
         println!(
-            "{}. `{}`: {:.1}s -> {:.1}s ({:.1}%)",
+            "{}. {}: {:.1}s -> {:.1}s ({:.1}%)",
             index + 1,
-            entry.job,
+            format_job_link(job_info_resolver, job_metrics, entry.job),
             entry.before.as_secs_f64(),
             entry.after.as_secs_f64(),
             entry.change
@@ -400,7 +407,11 @@ fn generate_test_name(name: &str) -> String {
 }
 
 /// Prints test changes in Markdown format to stdout.
-fn report_test_diffs(diff: AggregatedTestDiffs) {
+fn report_test_diffs(
+    diff: AggregatedTestDiffs,
+    job_metrics: &HashMap<JobName, JobMetrics>,
+    job_info_resolver: &mut JobInfoResolver,
+) {
     println!("# Test differences");
     if diff.diffs.is_empty() {
         println!("No test diffs found");
@@ -521,9 +532,26 @@ fn report_test_diffs(diff: AggregatedTestDiffs) {
                 println!(
                     "- {}: {}",
                     format_job_group(group as u64),
-                    jobs.iter().map(|j| format!("`{j}`")).collect::<Vec<_>>().join(", ")
+                    jobs.iter()
+                        .map(|j| format_job_link(job_info_resolver, job_metrics, j))
+                        .collect::<Vec<_>>()
+                        .join(", ")
                 );
             }
         },
     );
 }
+
+/// Tries to get a GitHub Actions job summary URL from the resolver.
+/// If it is not available, just wraps the job name in backticks.
+fn format_job_link(
+    job_info_resolver: &mut JobInfoResolver,
+    job_metrics: &HashMap<JobName, JobMetrics>,
+    job_name: &str,
+) -> String {
+    job_metrics
+        .get(job_name)
+        .and_then(|metrics| job_info_resolver.get_job_summary_link(job_name, &metrics.current))
+        .map(|summary_url| format!("[{job_name}]({summary_url})"))
+        .unwrap_or_else(|| format!("`{job_name}`"))
+}
diff --git a/src/ci/citool/src/github.rs b/src/ci/citool/src/github.rs
new file mode 100644
index 0000000000000..35e4c3f9599d6
--- /dev/null
+++ b/src/ci/citool/src/github.rs
@@ -0,0 +1,109 @@
+use std::collections::HashMap;
+
+use anyhow::Context;
+use build_helper::metrics::{CiMetadata, JsonRoot};
+
+pub struct GitHubClient;
+
+impl GitHubClient {
+    fn get_workflow_run_jobs(
+        &self,
+        repo: &str,
+        workflow_run_id: u64,
+    ) -> anyhow::Result<Vec<GitHubJob>> {
+        let req = ureq::get(format!(
+            "https://api.github.com/repos/{repo}/actions/runs/{workflow_run_id}/jobs?per_page=100"
+        ))
+        .header("User-Agent", "rust-lang/rust/citool")
+        .header("Accept", "application/vnd.github+json")
+        .header("X-GitHub-Api-Version", "2022-11-28")
+        .call()
+        .context("cannot get workflow job list")?;
+
+        let status = req.status();
+        let mut body = req.into_body();
+        if status.is_success() {
+            // This API response is actually paged, but we assume for now that there are at
+            // most 100 jobs per workflow.
+            let response = body
+                .read_json::<WorkflowRunJobsResponse>()
+                .context("cannot deserialize workflow run jobs response")?;
+            // The CI job names have a prefix, e.g. `auto - foo`. We remove the prefix here to
+            // normalize the job name.
+            Ok(response
+                .jobs
+                .into_iter()
+                .map(|mut job| {
+                    job.name = job
+                        .name
+                        .split_once(" - ")
+                        .map(|res| res.1.to_string())
+                        .unwrap_or_else(|| job.name);
+                    job
+                })
+                .collect())
+        } else {
+            Err(anyhow::anyhow!(
+                "Cannot get jobs of workflow run {workflow_run_id}: {status}\n{}",
+                body.read_to_string()?
+            ))
+        }
+    }
+}
+
+#[derive(serde::Deserialize)]
+struct WorkflowRunJobsResponse {
+    jobs: Vec<GitHubJob>,
+}
+
+#[derive(serde::Deserialize)]
+struct GitHubJob {
+    name: String,
+    id: u64,
+}
+
+/// Can be used to resolve information about GitHub Actions jobs.
+/// Caches results internally to avoid too unnecessary GitHub API calls.
+pub struct JobInfoResolver {
+    client: GitHubClient,
+    // Workflow run ID -> jobs
+    workflow_job_cache: HashMap<u64, Vec<GitHubJob>>,
+}
+
+impl JobInfoResolver {
+    pub fn new() -> Self {
+        Self { client: GitHubClient, workflow_job_cache: Default::default() }
+    }
+
+    /// Get a link to a job summary for the given job name and bootstrap execution.
+    pub fn get_job_summary_link(&mut self, job_name: &str, metrics: &JsonRoot) -> Option<String> {
+        metrics.ci_metadata.as_ref().and_then(|metadata| {
+            self.get_job_id(metadata, job_name).map(|job_id| {
+                format!(
+                    "https://github.com/{}/actions/runs/{}#summary-{job_id}",
+                    metadata.repository, metadata.workflow_run_id
+                )
+            })
+        })
+    }
+
+    fn get_job_id(&mut self, ci_metadata: &CiMetadata, job_name: &str) -> Option<u64> {
+        if let Some(job) = self
+            .workflow_job_cache
+            .get(&ci_metadata.workflow_run_id)
+            .and_then(|jobs| jobs.iter().find(|j| j.name == job_name))
+        {
+            return Some(job.id);
+        }
+
+        let jobs = self
+            .client
+            .get_workflow_run_jobs(&ci_metadata.repository, ci_metadata.workflow_run_id)
+            .inspect_err(|e| eprintln!("Cannot download workflow jobs: {e:?}"))
+            .ok()?;
+        let job_id = jobs.iter().find(|j| j.name == job_name).map(|j| j.id);
+        // Save the cache even if the job name was not found, it could be useful for further lookups
+        self.workflow_job_cache.insert(ci_metadata.workflow_run_id, jobs);
+        job_id
+    }
+}
diff --git a/src/ci/citool/src/main.rs b/src/ci/citool/src/main.rs
index 6db5eab458cca..a1956da352f5c 100644
--- a/src/ci/citool/src/main.rs
+++ b/src/ci/citool/src/main.rs
@@ -1,6 +1,7 @@
 mod analysis;
 mod cpu_usage;
 mod datadog;
+mod github;
 mod jobs;
 mod metrics;
 mod utils;
@@ -18,6 +19,7 @@ use serde_yaml::Value;
 use crate::analysis::{output_largest_duration_changes, output_test_diffs};
 use crate::cpu_usage::load_cpu_usage;
 use crate::datadog::upload_datadog_metric;
+use crate::github::JobInfoResolver;
 use crate::jobs::RunType;
 use crate::metrics::{JobMetrics, download_auto_job_metrics, download_job_metrics, load_metrics};
 use crate::utils::load_env_var;
@@ -145,6 +147,7 @@ fn postprocess_metrics(
 ) -> anyhow::Result<()> {
     let metrics = load_metrics(&metrics_path)?;
 
+    let mut job_info_resolver = JobInfoResolver::new();
     if let (Some(parent), Some(job_name)) = (parent, job_name) {
         // This command is executed also on PR builds, which might not have parent metrics
         // available, because some PR jobs don't run on auto builds, and PR jobs do not upload metrics
@@ -160,7 +163,7 @@ fn postprocess_metrics(
                     job_name,
                     JobMetrics { parent: Some(parent_metrics), current: metrics },
                 )]);
-                output_test_diffs(&job_metrics);
+                output_test_diffs(&job_metrics, &mut job_info_resolver);
                 return Ok(());
             }
             Err(error) => {
@@ -180,8 +183,10 @@ fn post_merge_report(db: JobDatabase, current: String, parent: String) -> anyhow
     let metrics = download_auto_job_metrics(&db, &parent, &current)?;
 
     println!("\nComparing {parent} (parent) -> {current} (this PR)\n");
-    output_test_diffs(&metrics);
-    output_largest_duration_changes(&metrics);
+
+    let mut job_info_resolver = JobInfoResolver::new();
+    output_test_diffs(&metrics, &mut job_info_resolver);
+    output_largest_duration_changes(&metrics, &mut job_info_resolver);
 
     Ok(())
 }
diff --git a/src/tools/miri/tests/pass/issues/issue-134713-swap_nonoverlapping_untyped.rs b/src/tools/miri/tests/pass/issues/issue-134713-swap_nonoverlapping_untyped.rs
new file mode 100644
index 0000000000000..a1da60ce65c4d
--- /dev/null
+++ b/src/tools/miri/tests/pass/issues/issue-134713-swap_nonoverlapping_untyped.rs
@@ -0,0 +1,30 @@
+use std::mem::{size_of, align_of};
+
+// See <https://github.com/rust-lang/rust/issues/134713>
+
+#[repr(C)]
+struct Foo(usize, u8);
+
+fn main() {
+    let buf1: [usize; 2] = [1000, 2000];
+    let buf2: [usize; 2] = [3000, 4000];
+
+    // Foo and [usize; 2] have the same size and alignment,
+    // so swap_nonoverlapping should treat them the same
+    assert_eq!(size_of::<Foo>(), size_of::<[usize; 2]>());
+    assert_eq!(align_of::<Foo>(), align_of::<[usize; 2]>());
+
+    let mut b1 = buf1;
+    let mut b2 = buf2;
+    // Safety: b1 and b2 are distinct local variables,
+    // with the same size and alignment as Foo.
+    unsafe {
+        std::ptr::swap_nonoverlapping(
+            b1.as_mut_ptr().cast::<Foo>(),
+            b2.as_mut_ptr().cast::<Foo>(),
+            1,
+        );
+    }
+    assert_eq!(b1, buf2);
+    assert_eq!(b2, buf1);
+}
diff --git a/tests/assembly/x86_64-typed-swap.rs b/tests/assembly/x86_64-typed-swap.rs
index dfd6ee565bccb..a6753011d3620 100644
--- a/tests/assembly/x86_64-typed-swap.rs
+++ b/tests/assembly/x86_64-typed-swap.rs
@@ -51,3 +51,31 @@ pub fn swap_simd(x: &mut __m128, y: &mut __m128) {
     // CHECK-NEXT: retq
     swap(x, y)
 }
+
+// CHECK-LABEL: swap_string:
+#[no_mangle]
+pub fn swap_string(x: &mut String, y: &mut String) {
+    // CHECK-NOT: mov
+    // CHECK-COUNT-4: movups
+    // CHECK-NOT: mov
+    // CHECK-COUNT-4: movq
+    // CHECK-NOT: mov
+    swap(x, y)
+}
+
+// CHECK-LABEL: swap_44_bytes:
+#[no_mangle]
+pub fn swap_44_bytes(x: &mut [u8; 44], y: &mut [u8; 44]) {
+    // Ensure we do better than a long run of byte copies,
+    // see <https://github.com/rust-lang/rust/issues/134946>
+
+    // CHECK-NOT: movb
+    // CHECK-COUNT-8: movups{{.+}}xmm
+    // CHECK-NOT: movb
+    // CHECK-COUNT-4: movq
+    // CHECK-NOT: movb
+    // CHECK-COUNT-4: movl
+    // CHECK-NOT: movb
+    // CHECK: retq
+    swap(x, y)
+}
diff --git a/tests/codegen/simd/swap-simd-types.rs b/tests/codegen/simd/swap-simd-types.rs
index 69767d0a75580..c063cc683a616 100644
--- a/tests/codegen/simd/swap-simd-types.rs
+++ b/tests/codegen/simd/swap-simd-types.rs
@@ -23,8 +23,8 @@ pub fn swap_single_m256(x: &mut __m256, y: &mut __m256) {
 #[no_mangle]
 pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) {
     // CHECK-NOT: alloca
-    // CHECK: load <8 x float>{{.+}}align 32
-    // CHECK: store <8 x float>{{.+}}align 32
+    // CHECK-COUNT-2: load <4 x i64>{{.+}}align 32
+    // CHECK-COUNT-2: store <4 x i64>{{.+}}align 32
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
@@ -34,7 +34,7 @@ pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) {
 #[no_mangle]
 pub fn swap_bytes32(x: &mut [u8; 32], y: &mut [u8; 32]) {
     // CHECK-NOT: alloca
-    // CHECK: load <32 x i8>{{.+}}align 1
-    // CHECK: store <32 x i8>{{.+}}align 1
+    // CHECK-COUNT-2: load <4 x i64>{{.+}}align 1
+    // CHECK-COUNT-2: store <4 x i64>{{.+}}align 1
     swap(x, y)
 }
diff --git a/tests/codegen/swap-large-types.rs b/tests/codegen/swap-large-types.rs
index 49a41bb14692f..08c486affd949 100644
--- a/tests/codegen/swap-large-types.rs
+++ b/tests/codegen/swap-large-types.rs
@@ -12,6 +12,16 @@ type KeccakBuffer = [[u64; 5]; 5];
 // to stack for large types, which is completely unnecessary as the lack of
 // overlap means we can just do whatever fits in registers at a time.
 
+// The tests here (after the first one showing that the problem still exists)
+// are less about testing *exactly* what the codegen is, and more about testing
+// 1) That things are swapped directly from one argument to the other,
+//    never going through stack along the way, and
+// 2) That we're doing the swapping for big things using large vector types,
+//    rather then `i64` or `<8 x i8>` (or, even worse, `i8`) at a time.
+//
+// (There are separate tests for intrinsics::typed_swap_nonoverlapping that
+//  check that it, as an intrinsic, are emitting exactly what it should.)
+
 // CHECK-LABEL: @swap_basic
 #[no_mangle]
 pub fn swap_basic(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
@@ -26,55 +36,55 @@ pub fn swap_basic(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
     }
 }
 
-// This test verifies that the library does something smarter, and thus
-// doesn't need any scratch space on the stack.
-
 // CHECK-LABEL: @swap_std
 #[no_mangle]
 pub fn swap_std(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i64>
-    // CHECK: store <{{[0-9]+}} x i64>
+    // CHECK: load <{{2|4}} x i64>
+    // CHECK: store <{{2|4}} x i64>
     swap(x, y)
 }
 
-// Verify that types with usize alignment are swapped via vectored usizes,
-// not falling back to byte-level code.
-
 // CHECK-LABEL: @swap_slice
 #[no_mangle]
 pub fn swap_slice(x: &mut [KeccakBuffer], y: &mut [KeccakBuffer]) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i64>
-    // CHECK: store <{{[0-9]+}} x i64>
+    // CHECK: load <{{2|4}} x i64>
+    // CHECK: store <{{2|4}} x i64>
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
 }
 
-// But for a large align-1 type, vectorized byte copying is what we want.
-
 type OneKilobyteBuffer = [u8; 1024];
 
 // CHECK-LABEL: @swap_1kb_slices
 #[no_mangle]
 pub fn swap_1kb_slices(x: &mut [OneKilobyteBuffer], y: &mut [OneKilobyteBuffer]) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i8>
-    // CHECK: store <{{[0-9]+}} x i8>
+
+    // CHECK-NOT: load i32
+    // CHECK-NOT: store i32
+    // CHECK-NOT: load i16
+    // CHECK-NOT: store i16
+    // CHECK-NOT: load i8
+    // CHECK-NOT: store i8
+
+    // CHECK: load <{{2|4}} x i64>{{.+}}align 1,
+    // CHECK: store <{{2|4}} x i64>{{.+}}align 1,
+
+    // CHECK-NOT: load i32
+    // CHECK-NOT: store i32
+    // CHECK-NOT: load i16
+    // CHECK-NOT: store i16
+    // CHECK-NOT: load i8
+    // CHECK-NOT: store i8
+
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
 }
 
-// This verifies that the 2×read + 2×write optimizes to just 3 memcpys
-// for an unusual type like this.  It's not clear whether we should do anything
-// smarter in Rust for these, so for now it's fine to leave these up to the backend.
-// That's not as bad as it might seem, as for example, LLVM will lower the
-// memcpys below to VMOVAPS on YMMs if one enables the AVX target feature.
-// Eventually we'll be able to pass `align_of::<T>` to a const generic and
-// thus pick a smarter chunk size ourselves without huge code duplication.
-
 #[repr(align(64))]
 pub struct BigButHighlyAligned([u8; 64 * 3]);
 
@@ -82,9 +92,25 @@ pub struct BigButHighlyAligned([u8; 64 * 3]);
 #[no_mangle]
 pub fn swap_big_aligned(x: &mut BigButHighlyAligned, y: &mut BigButHighlyAligned) {
     // CHECK-NOT: call void @llvm.memcpy
-    // CHECK: call void @llvm.memcpy.{{.+}}(ptr noundef nonnull align 64 dereferenceable(192)
-    // CHECK: call void @llvm.memcpy.{{.+}}(ptr noundef nonnull align 64 dereferenceable(192)
-    // CHECK: call void @llvm.memcpy.{{.+}}(ptr noundef nonnull align 64 dereferenceable(192)
+    // CHECK-NOT: load i32
+    // CHECK-NOT: store i32
+    // CHECK-NOT: load i16
+    // CHECK-NOT: store i16
+    // CHECK-NOT: load i8
+    // CHECK-NOT: store i8
+
+    // CHECK-COUNT-2: load <{{2|4}} x i64>{{.+}}align 64,
+    // CHECK-COUNT-2: store <{{2|4}} x i64>{{.+}}align 64,
+
+    // CHECK-COUNT-2: load <{{2|4}} x i64>{{.+}}align 32,
+    // CHECK-COUNT-2: store <{{2|4}} x i64>{{.+}}align 32,
+
+    // CHECK-NOT: load i32
+    // CHECK-NOT: store i32
+    // CHECK-NOT: load i16
+    // CHECK-NOT: store i16
+    // CHECK-NOT: load i8
+    // CHECK-NOT: store i8
     // CHECK-NOT: call void @llvm.memcpy
     swap(x, y)
 }
diff --git a/tests/codegen/swap-small-types.rs b/tests/codegen/swap-small-types.rs
index 76bb853e64238..ffa573c9a43ab 100644
--- a/tests/codegen/swap-small-types.rs
+++ b/tests/codegen/swap-small-types.rs
@@ -1,5 +1,6 @@
 //@ compile-flags: -Copt-level=3 -Z merge-functions=disabled
 //@ only-x86_64
+//@ min-llvm-version: 20
 
 #![crate_type = "lib"]
 
@@ -27,13 +28,19 @@ pub fn swap_rgb48_manually(x: &mut RGB48, y: &mut RGB48) {
 pub fn swap_rgb48(x: &mut RGB48, y: &mut RGB48) {
     // CHECK-NOT: alloca
 
-    // Whether `i8` is the best for this is unclear, but
-    // might as well record what's actually happening right now.
-
-    // CHECK: load i8
-    // CHECK: load i8
-    // CHECK: store i8
-    // CHECK: store i8
+    // Swapping `i48` might be cleaner in LLVM-IR here, but `i32`+`i16` isn't bad,
+    // and is closer to the assembly it generates anyway.
+
+    // CHECK-NOT: load{{ }}
+    // CHECK: load i32{{.+}}align 2
+    // CHECK-NEXT: load i32{{.+}}align 2
+    // CHECK-NEXT: store i32{{.+}}align 2
+    // CHECK-NEXT: store i32{{.+}}align 2
+    // CHECK: load i16{{.+}}align 2
+    // CHECK-NEXT: load i16{{.+}}align 2
+    // CHECK-NEXT: store i16{{.+}}align 2
+    // CHECK-NEXT: store i16{{.+}}align 2
+    // CHECK-NOT: store{{ }}
     swap(x, y)
 }
 
@@ -76,30 +83,49 @@ pub fn swap_slices<'a>(x: &mut &'a [u32], y: &mut &'a [u32]) {
     swap(x, y)
 }
 
-// LLVM doesn't vectorize a loop over 3-byte elements,
-// so we chunk it down to bytes and loop over those instead.
 type RGB24 = [u8; 3];
 
 // CHECK-LABEL: @swap_rgb24_slices
 #[no_mangle]
 pub fn swap_rgb24_slices(x: &mut [RGB24], y: &mut [RGB24]) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i8>
-    // CHECK: store <{{[0-9]+}} x i8>
+
+    // CHECK: mul nuw nsw i64 %{{x|y}}.1, 3
+
+    // CHECK: load <{{[0-9]+}} x i64>
+    // CHECK: store <{{[0-9]+}} x i64>
+
+    // CHECK-COUNT-2: load i32
+    // CHECK-COUNT-2: store i32
+    // CHECK-COUNT-2: load i16
+    // CHECK-COUNT-2: store i16
+    // CHECK-COUNT-2: load i8
+    // CHECK-COUNT-2: store i8
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
 }
 
-// This one has a power-of-two size, so we iterate over it directly
 type RGBA32 = [u8; 4];
 
 // CHECK-LABEL: @swap_rgba32_slices
 #[no_mangle]
 pub fn swap_rgba32_slices(x: &mut [RGBA32], y: &mut [RGBA32]) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i32>
-    // CHECK: store <{{[0-9]+}} x i32>
+
+    // Because the size in bytes in a multiple of 4, we can skip the smallest sizes.
+
+    // CHECK: load <{{[0-9]+}} x i64>
+    // CHECK: store <{{[0-9]+}} x i64>
+
+    // CHECK-COUNT-2: load i32
+    // CHECK-COUNT-2: store i32
+
+    // CHECK-NOT: load i16
+    // CHECK-NOT: store i16
+    // CHECK-NOT: load i8
+    // CHECK-NOT: store i8
+
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
@@ -113,8 +139,8 @@ const _: () = assert!(!std::mem::size_of::<String>().is_power_of_two());
 #[no_mangle]
 pub fn swap_string_slices(x: &mut [String], y: &mut [String]) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i64>
-    // CHECK: store <{{[0-9]+}} x i64>
+    // CHECK: load <{{[0-9]+}} x i64>{{.+}}, align 8,
+    // CHECK: store <{{[0-9]+}} x i64>{{.+}}, align 8,
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
@@ -130,6 +156,26 @@ pub struct Packed {
 #[no_mangle]
 pub fn swap_packed_structs(x: &mut Packed, y: &mut Packed) {
     // CHECK-NOT: alloca
+
+    // CHECK-NOT: load
+    // CHECK-NOT: store
+
+    // CHECK: %[[A:.+]] = load i64, ptr %x, align 1,
+    // CHECK-NEXT: %[[B:.+]] = load i64, ptr %y, align 1,
+    // CHECK-NEXT: store i64 %[[B]], ptr %x, align 1,
+    // CHECK-NEXT: store i64 %[[A]], ptr %y, align 1,
+
+    // CHECK-NOT: load
+    // CHECK-NOT: store
+
+    // CHECK: %[[C:.+]] = load i8, ptr %[[X8:.+]], align 1,
+    // CHECK-NEXT: %[[D:.+]] = load i8, ptr %[[Y8:.+]], align 1,
+    // CHECK-NEXT: store i8 %[[D]], ptr %[[X8]], align 1,
+    // CHECK-NEXT: store i8 %[[C]], ptr %[[Y8]], align 1,
+
+    // CHECK-NOT: load
+    // CHECK-NOT: store
+
     // CHECK: ret void
     swap(x, y)
 }
diff --git a/tests/ui/consts/missing_span_in_backtrace.stderr b/tests/ui/consts/missing_span_in_backtrace.stderr
index 2f3a65302bd57..aad3d76dd2697 100644
--- a/tests/ui/consts/missing_span_in_backtrace.stderr
+++ b/tests/ui/consts/missing_span_in_backtrace.stderr
@@ -12,10 +12,10 @@ note: inside `swap_nonoverlapping::<MaybeUninit<u8>>`
   --> $SRC_DIR/core/src/ptr/mod.rs:LL:COL
 note: inside `swap_nonoverlapping::compiletime::<MaybeUninit<u8>>`
   --> $SRC_DIR/core/src/ptr/mod.rs:LL:COL
-note: inside `std::ptr::swap_nonoverlapping_simple_untyped::<MaybeUninit<u8>>`
-  --> $SRC_DIR/core/src/ptr/mod.rs:LL:COL
-note: inside `std::ptr::read::<MaybeUninit<MaybeUninit<u8>>>`
+note: inside `std::ptr::swap_nonoverlapping_const::<MaybeUninit<u8>>`
   --> $SRC_DIR/core/src/ptr/mod.rs:LL:COL
+note: inside `copy_nonoverlapping::<MaybeUninit<u8>>`
+  --> $SRC_DIR/core/src/intrinsics/mod.rs:LL:COL
    = help: this code performed an operation that depends on the underlying bytes representing a pointer
    = help: the absolute address of a pointer is not known at compile-time, so such operations are not supported
    = note: this error originates in the macro `$crate::intrinsics::const_eval_select` which comes from the expansion of the macro `const_eval_select` (in Nightly builds, run with -Z macro-backtrace for more info)
diff --git a/tests/ui/process/win-command-curdir-no-verbatim.rs b/tests/ui/process/win-command-curdir-no-verbatim.rs
new file mode 100644
index 0000000000000..99943e1c8abc2
--- /dev/null
+++ b/tests/ui/process/win-command-curdir-no-verbatim.rs
@@ -0,0 +1,36 @@
+// Test that windows verbatim paths in `Command::current_dir` are converted to
+// non-verbatim paths before passing to the subprocess.
+
+//@ run-pass
+//@ only-windows
+//@ needs-subprocess
+
+use std::env;
+use std::process::Command;
+
+fn main() {
+    if env::args().skip(1).any(|s| s == "--child") {
+        child();
+    } else {
+        parent();
+    }
+}
+
+fn parent() {
+    let exe = env::current_exe().unwrap();
+    let dir = env::current_dir().unwrap();
+    let status = Command::new(&exe)
+        .arg("--child")
+        .current_dir(dir.canonicalize().unwrap())
+        .spawn()
+        .unwrap()
+        .wait()
+        .unwrap();
+    assert_eq!(status.code(), Some(0));
+}
+
+fn child() {
+    let current_dir = env::current_dir().unwrap();
+    let current_dir = current_dir.as_os_str().as_encoded_bytes();
+    assert!(!current_dir.starts_with(br"\\?\"));
+}
diff --git a/tests/ui/typeck/auxiliary/suggest-trait-reexported-as-not-doc-visible-a.rs b/tests/ui/typeck/auxiliary/suggest-trait-reexported-as-not-doc-visible-a.rs
new file mode 100644
index 0000000000000..87d83663626b5
--- /dev/null
+++ b/tests/ui/typeck/auxiliary/suggest-trait-reexported-as-not-doc-visible-a.rs
@@ -0,0 +1,5 @@
+//@ edition: 2021
+
+pub trait Foo {
+    fn foo();
+}
diff --git a/tests/ui/typeck/auxiliary/suggest-trait-reexported-as-not-doc-visible-b.rs b/tests/ui/typeck/auxiliary/suggest-trait-reexported-as-not-doc-visible-b.rs
new file mode 100644
index 0000000000000..9e90e0f0b356c
--- /dev/null
+++ b/tests/ui/typeck/auxiliary/suggest-trait-reexported-as-not-doc-visible-b.rs
@@ -0,0 +1,14 @@
+// ignore-tidy-linelength
+//@ edition: 2021
+//@ aux-crate:suggest_trait_reexported_as_not_doc_visible_a=suggest-trait-reexported-as-not-doc-visible-a.rs
+
+pub struct Bar;
+
+impl __DocHidden::Foo for Bar {
+    fn foo() {}
+}
+
+#[doc(hidden)]
+pub mod __DocHidden {
+    pub use suggest_trait_reexported_as_not_doc_visible_a::Foo;
+}
diff --git a/tests/ui/typeck/suggest-trait-reexported-as-not-doc-visible.rs b/tests/ui/typeck/suggest-trait-reexported-as-not-doc-visible.rs
new file mode 100644
index 0000000000000..3cb775e85ac6b
--- /dev/null
+++ b/tests/ui/typeck/suggest-trait-reexported-as-not-doc-visible.rs
@@ -0,0 +1,11 @@
+// ignore-tidy-linelength
+//@ edition: 2021
+//@ aux-crate:suggest_trait_reexported_as_not_doc_visible_a=suggest-trait-reexported-as-not-doc-visible-a.rs
+//@ aux-crate:suggest_trait_reexported_as_not_doc_visible_b=suggest-trait-reexported-as-not-doc-visible-b.rs
+
+use suggest_trait_reexported_as_not_doc_visible_b::Bar;
+
+fn main() {
+    Bar::foo();
+    //~^ ERROR: no function or associated item named `foo` found for struct `Bar` in the current scope [E0599]
+}
diff --git a/tests/ui/typeck/suggest-trait-reexported-as-not-doc-visible.stderr b/tests/ui/typeck/suggest-trait-reexported-as-not-doc-visible.stderr
new file mode 100644
index 0000000000000..9128ee6844469
--- /dev/null
+++ b/tests/ui/typeck/suggest-trait-reexported-as-not-doc-visible.stderr
@@ -0,0 +1,15 @@
+error[E0599]: no function or associated item named `foo` found for struct `Bar` in the current scope
+  --> $DIR/suggest-trait-reexported-as-not-doc-visible.rs:9:10
+   |
+LL |     Bar::foo();
+   |          ^^^ function or associated item not found in `Bar`
+   |
+   = help: items from traits can only be used if the trait is in scope
+help: trait `Foo` which provides `foo` is implemented but not in scope; perhaps you want to import it
+   |
+LL + use suggest_trait_reexported_as_not_doc_visible_a::Foo;
+   |
+
+error: aborting due to 1 previous error
+
+For more information about this error, try `rustc --explain E0599`.