diff --git a/library/compiler-builtins/.github/workflows/main.yaml b/library/compiler-builtins/.github/workflows/main.yaml
index c8faecfcb2cc7..38995cf0f0ff0 100644
--- a/library/compiler-builtins/.github/workflows/main.yaml
+++ b/library/compiler-builtins/.github/workflows/main.yaml
@@ -1,6 +1,6 @@
 name: CI
 on:
-  push: { branches: [master] }
+  push: { branches: [main] }
   pull_request:
 
 concurrency:
@@ -89,7 +89,7 @@ jobs:
         - target: x86_64-unknown-linux-gnu
           os: ubuntu-24.04
         - target: x86_64-apple-darwin
-          os: macos-13
+          os: macos-15-intel
         - target: i686-pc-windows-msvc
           os: windows-2025
         - target: x86_64-pc-windows-msvc
@@ -239,6 +239,8 @@ jobs:
         include:
         - target: x86_64-unknown-linux-gnu
           os: ubuntu-24.04
+        - target: aarch64-unknown-linux-gnu
+          os: ubuntu-24.04-arm
     runs-on: ${{ matrix.os }}
     steps:
     - uses: actions/checkout@master
@@ -247,13 +249,13 @@ jobs:
     - name: Set up dependencies
       run: |
         sudo apt-get update
-        sudo apt-get install -y valgrind gdb libc6-dbg # Needed for iai-callgrind
+        sudo apt-get install -y valgrind gdb libc6-dbg # Needed for gungraun
         rustup update "$BENCHMARK_RUSTC" --no-self-update
         rustup default "$BENCHMARK_RUSTC"
-        # Install the version of iai-callgrind-runner that is specified in Cargo.toml
-        iai_version="$(cargo metadata --format-version=1 --features icount |
-           jq -r '.packages[] | select(.name == "iai-callgrind").version')"
-        cargo binstall -y iai-callgrind-runner --version "$iai_version"
+        # Install the version of gungraun-runner that is specified in Cargo.toml
+        gungraun_version="$(cargo metadata --format-version=1 --features icount |
+           jq -r '.packages[] | select(.name == "gungraun").version')"
+        cargo binstall -y gungraun-runner --version "$gungraun_version"
         sudo apt-get install valgrind
     - uses: Swatinem/rust-cache@v2
       with:
diff --git a/library/compiler-builtins/.github/workflows/publish.yaml b/library/compiler-builtins/.github/workflows/publish.yaml
index 85a33c039d2a1..d6f1dc398e8ec 100644
--- a/library/compiler-builtins/.github/workflows/publish.yaml
+++ b/library/compiler-builtins/.github/workflows/publish.yaml
@@ -5,7 +5,7 @@ permissions:
   contents: write
 
 on:
-  push: { branches: [master] }
+  push: { branches: [main] }
 
 jobs:
   release-plz:
diff --git a/library/compiler-builtins/.github/workflows/rustc-pull.yml b/library/compiler-builtins/.github/workflows/rustc-pull.yml
index ad7693e17b0ee..617db14f46eea 100644
--- a/library/compiler-builtins/.github/workflows/rustc-pull.yml
+++ b/library/compiler-builtins/.github/workflows/rustc-pull.yml
@@ -17,7 +17,7 @@ jobs:
       zulip-stream-id: 219381
       zulip-topic: 'compiler-builtins subtree sync automation'
       zulip-bot-email: "compiler-builtins-ci-bot@rust-lang.zulipchat.com"
-      pr-base-branch: master
+      pr-base-branch: main
       branch-name: rustc-pull
     secrets:
       zulip-api-token: ${{ secrets.ZULIP_API_TOKEN }}
diff --git a/library/compiler-builtins/.gitignore b/library/compiler-builtins/.gitignore
index f12b871c2f783..abe346659d4c7 100644
--- a/library/compiler-builtins/.gitignore
+++ b/library/compiler-builtins/.gitignore
@@ -9,6 +9,7 @@ compiler-rt
 # Benchmark cache
 baseline-*
 iai-home
+gungraun-home
 
 # Temporary files
 *.bk
diff --git a/library/compiler-builtins/CONTRIBUTING.md b/library/compiler-builtins/CONTRIBUTING.md
index 9ae4f893c60d1..f74d3f8ba1276 100644
--- a/library/compiler-builtins/CONTRIBUTING.md
+++ b/library/compiler-builtins/CONTRIBUTING.md
@@ -150,8 +150,8 @@ cargo bench --no-default-features \
 ```
 
 There are also benchmarks that check instruction count behind the `icount`
-feature. These require [`iai-callgrind-runner`] (via Cargo) and [Valgrind]
-to be installed, which means these only run on limited platforms.
+feature. These require [`gungraun-runner`] (via Cargo) and [Valgrind] to be
+installed, which means these only run on limited platforms.
 
 Instruction count benchmarks are run as part of CI to flag performance
 regresions.
@@ -163,7 +163,7 @@ cargo bench --no-default-features \
     --bench icount --bench mem_icount
 ```
 
-[`iai-callgrind-runner`]: https://crates.io/crates/iai-callgrind-runner
+[`gungraun-runner`]: https://crates.io/crates/gungraun-runner
 [Valgrind]: https://valgrind.org/
 
 ## Subtree synchronization
diff --git a/library/compiler-builtins/Cargo.toml b/library/compiler-builtins/Cargo.toml
index 956d738f3b1f1..8501f4e630b55 100644
--- a/library/compiler-builtins/Cargo.toml
+++ b/library/compiler-builtins/Cargo.toml
@@ -51,5 +51,6 @@ codegen-units = 1
 lto = "fat"
 
 [profile.bench]
-# Required for iai-callgrind
+# Required for gungraun
 debug = true
+strip = false
diff --git a/library/compiler-builtins/PUBLISHING.md b/library/compiler-builtins/PUBLISHING.md
index 3df682ab04a4b..c521910641f55 100644
--- a/library/compiler-builtins/PUBLISHING.md
+++ b/library/compiler-builtins/PUBLISHING.md
@@ -5,7 +5,7 @@ It's not great, but it works for now. PRs to improve this process would be
 greatly appreciated!
 
 1. Make sure you've got a clean working tree and it's updated with the latest
-   changes on `master`
+   changes on `main`
 2. Edit `Cargo.toml` to bump the version number
 3. Commit this change
 4. Run `git tag` to create a tag for this version
diff --git a/library/compiler-builtins/builtins-shim/Cargo.toml b/library/compiler-builtins/builtins-shim/Cargo.toml
index 707ebdbc77b27..ac77224f5ce1e 100644
--- a/library/compiler-builtins/builtins-shim/Cargo.toml
+++ b/library/compiler-builtins/builtins-shim/Cargo.toml
@@ -11,7 +11,12 @@
 [package]
 name = "compiler_builtins"
 version = "0.1.160"
-authors = ["Jorge Aparicio <japaricious@gmail.com>"]
+authors = [
+    "Alex Crichton <alex@alexcrichton.com>",
+    "Amanieu d'Antras <amanieu@gmail.com>",
+    "Jorge Aparicio <japaricious@gmail.com>",
+    "Trevor Gross <tg@trevorgross.com>",
+]
 description = "Compiler intrinsics used by the Rust compiler."
 repository = "https://github.com/rust-lang/compiler-builtins"
 license = "MIT AND Apache-2.0 WITH LLVM-exception AND (MIT OR Apache-2.0)"
diff --git a/library/compiler-builtins/builtins-test/Cargo.toml b/library/compiler-builtins/builtins-test/Cargo.toml
index 00a9d8579d119..9346ea65420b2 100644
--- a/library/compiler-builtins/builtins-test/Cargo.toml
+++ b/library/compiler-builtins/builtins-test/Cargo.toml
@@ -1,7 +1,6 @@
 [package]
 name = "builtins-test"
 version = "0.1.0"
-authors = ["Alex Crichton <alex@alexcrichton.com>"]
 edition = "2024"
 publish = false
 license = "MIT AND Apache-2.0 WITH LLVM-exception AND (MIT OR Apache-2.0)"
@@ -14,7 +13,7 @@ rand_xoshiro = "0.7"
 # To compare float builtins against
 rustc_apfloat = "0.2.3"
 # Really a dev dependency, but dev dependencies can't be optional
-iai-callgrind = { version = "0.15.2", optional = true }
+gungraun = { version = "0.17.0", optional = true }
 
 [dependencies.compiler_builtins]
 path = "../builtins-shim"
@@ -46,8 +45,8 @@ no-sys-f16-f64-convert = []
 # Skip tests that rely on f16 symbols being available on the system
 no-sys-f16 = ["no-sys-f16-f64-convert"]
 
-# Enable icount benchmarks (requires iai-callgrind and valgrind)
-icount = ["dep:iai-callgrind"]
+# Enable icount benchmarks (requires gungraun-runner and valgrind locally)
+icount = ["dep:gungraun"]
 
 # Enable report generation without bringing in more dependencies by default
 benchmarking-reports = ["criterion/plotters", "criterion/html_reports"]
diff --git a/library/compiler-builtins/builtins-test/benches/mem_icount.rs b/library/compiler-builtins/builtins-test/benches/mem_icount.rs
index bd88cf80c7de2..37595e8258436 100644
--- a/library/compiler-builtins/builtins-test/benches/mem_icount.rs
+++ b/library/compiler-builtins/builtins-test/benches/mem_icount.rs
@@ -1,11 +1,11 @@
-//! Benchmarks that use Callgrind (via `iai_callgrind`) to report instruction count metrics. This
+//! Benchmarks that use Callgrind (via `gungraun`) to report instruction count metrics. This
 //! is stable enough to be tested in CI.
 
 use std::hint::black_box;
 use std::{ops, slice};
 
 use compiler_builtins::mem::{memcmp, memcpy, memmove, memset};
-use iai_callgrind::{library_benchmark, library_benchmark_group, main};
+use gungraun::{library_benchmark, library_benchmark_group, main};
 
 const PAGE_SIZE: usize = 0x1000; // 4 kiB
 const MAX_ALIGN: usize = 512; // assume we may use avx512 operations one day
@@ -108,7 +108,7 @@ mod mcpy {
         ],
         setup = setup,
     )]
-    fn bench((len, mut dst, src): (usize, AlignedSlice, AlignedSlice)) {
+    fn bench_cpy((len, mut dst, src): (usize, AlignedSlice, AlignedSlice)) {
         unsafe {
             black_box(memcpy(
                 black_box(dst.as_mut_ptr()),
@@ -118,7 +118,7 @@ mod mcpy {
         }
     }
 
-    library_benchmark_group!(name = memcpy; benchmarks = bench);
+    library_benchmark_group!(name = memcpy; benchmarks = bench_cpy);
 }
 
 mod mset {
@@ -157,7 +157,7 @@ mod mset {
         ],
         setup = setup,
     )]
-    fn bench((len, mut dst): (usize, AlignedSlice)) {
+    fn bench_set((len, mut dst): (usize, AlignedSlice)) {
         unsafe {
             black_box(memset(
                 black_box(dst.as_mut_ptr()),
@@ -167,7 +167,7 @@ mod mset {
         }
     }
 
-    library_benchmark_group!(name = memset; benchmarks = bench);
+    library_benchmark_group!(name = memset; benchmarks = bench_set);
 }
 
 mod mcmp {
@@ -225,7 +225,7 @@ mod mcmp {
         ],
         setup = setup
     )]
-    fn bench((len, mut dst, src): (usize, AlignedSlice, AlignedSlice)) {
+    fn bench_cmp((len, mut dst, src): (usize, AlignedSlice, AlignedSlice)) {
         unsafe {
             black_box(memcmp(
                 black_box(dst.as_mut_ptr()),
@@ -235,7 +235,7 @@ mod mcmp {
         }
     }
 
-    library_benchmark_group!(name = memcmp; benchmarks = bench);
+    library_benchmark_group!(name = memcmp; benchmarks = bench_cmp);
 }
 
 mod mmove {
@@ -384,7 +384,7 @@ mod mmove {
         ],
         setup = setup_forward
     )]
-    fn forward((len, spread, mut buf): (usize, usize, AlignedSlice)) {
+    fn forward_move((len, spread, mut buf): (usize, usize, AlignedSlice)) {
         // Test moving from the start of the buffer toward the end
         unsafe {
             black_box(memmove(
@@ -478,7 +478,7 @@ mod mmove {
         ],
         setup = setup_backward
     )]
-    fn backward((len, spread, mut buf): (usize, usize, AlignedSlice)) {
+    fn backward_move((len, spread, mut buf): (usize, usize, AlignedSlice)) {
         // Test moving from the end of the buffer toward the start
         unsafe {
             black_box(memmove(
@@ -489,7 +489,7 @@ mod mmove {
         }
     }
 
-    library_benchmark_group!(name = memmove; benchmarks = forward, backward);
+    library_benchmark_group!(name = memmove; benchmarks = forward_move, backward_move);
 }
 
 use mcmp::memcmp;
diff --git a/library/compiler-builtins/builtins-test/tests/lse.rs b/library/compiler-builtins/builtins-test/tests/lse.rs
index 5d59fbb7f44d2..56891be8a8ac1 100644
--- a/library/compiler-builtins/builtins-test/tests/lse.rs
+++ b/library/compiler-builtins/builtins-test/tests/lse.rs
@@ -19,7 +19,11 @@ mod cas {
                 let mut target = expected.wrapping_add(10);
                 assert_eq!(
                     unsafe {
-                        compiler_builtins::aarch64_linux::$name::$name(expected, new, &mut target)
+                        compiler_builtins::aarch64_outline_atomics::$name::$name(
+                            expected,
+                            new,
+                            &mut target,
+                        )
                     },
                     expected.wrapping_add(10),
                     "return value should always be the previous value",
@@ -33,7 +37,11 @@ mod cas {
                 target = expected;
                 assert_eq!(
                     unsafe {
-                        compiler_builtins::aarch64_linux::$name::$name(expected, new, &mut target)
+                        compiler_builtins::aarch64_outline_atomics::$name::$name(
+                            expected,
+                            new,
+                            &mut target,
+                        )
                     },
                     expected
                 );
@@ -54,7 +62,9 @@ mod swap {
             builtins_test::fuzz_2(10000, |left: super::int_ty!($bytes), mut right| {
                 let orig_right = right;
                 assert_eq!(
-                    unsafe { compiler_builtins::aarch64_linux::$name::$name(left, &mut right) },
+                    unsafe {
+                        compiler_builtins::aarch64_outline_atomics::$name::$name(left, &mut right)
+                    },
                     orig_right
                 );
                 assert_eq!(left, right);
@@ -74,7 +84,7 @@ macro_rules! test_op {
                             let mut target = old;
                             let op: fn(super::int_ty!($bytes), super::int_ty!($bytes)) -> _ = $($op)*;
                             let expected = op(old, val);
-                            assert_eq!(old, unsafe { compiler_builtins::aarch64_linux::$name::$name(val, &mut target) }, "{} should return original value", stringify!($name));
+                            assert_eq!(old, unsafe { compiler_builtins::aarch64_outline_atomics::$name::$name(val, &mut target) }, "{} should return original value", stringify!($name));
                             assert_eq!(expected, target, "{} should store to target", stringify!($name));
                         });
                     }
diff --git a/library/compiler-builtins/ci/bench-icount.sh b/library/compiler-builtins/ci/bench-icount.sh
index 12228b9da971b..6d92b50a6dae7 100755
--- a/library/compiler-builtins/ci/bench-icount.sh
+++ b/library/compiler-builtins/ci/bench-icount.sh
@@ -10,35 +10,43 @@ if [ -z "$target" ]; then
     target="$host_target"
 fi
 
-iai_home="iai-home"
+# Print machine information
+uname -a
+lscpu || true
+
+gungraun_home="gungraun-home"
 
 # Use the arch as a tag to disambiguate artifacts
 tag="$(echo "$target" | cut -d'-' -f1)"
 
-# Download the baseline from master
+# Download the baseline from main
 ./ci/ci-util.py locate-baseline --download --extract --tag "$tag"
 
+# FIXME: this migrates iai-named baselines to the gungraun name. It can be
+# dropped after the first run with gungraun.
+[ -d "iai-home" ] && mv "iai-home" "$gungraun_home"
+
 # Run benchmarks once
 function run_icount_benchmarks() {
     cargo_args=(
-        "--bench" "icount"
+        "--bench" "*icount*"
         "--no-default-features"
         "--features" "unstable,unstable-float,icount"
     )
 
-    iai_args=(
-        "--home" "$(pwd)/$iai_home"
-        "--callgrind-limits=ir=5.0"
+    gungraun_args=(
+        "--home" "$(pwd)/$gungraun_home"
+        "--callgrind-limits=ir=5.0%"
         "--save-summary"
     )
 
-    # Parse `cargo_arg0 cargo_arg1 -- iai_arg0 iai_arg1` syntax
-    parsing_iai_args=0
+    # Parse `cargo_arg0 cargo_arg1 -- gungraun_arg0 gungraun_arg1` syntax
+    parsing_gungraun_args=0
     while [ "$#" -gt 0 ]; do
-        if [ "$parsing_iai_args" == "1" ]; then
-            iai_args+=("$1")
+        if [ "$parsing_gungraun_args" == "1" ]; then
+            gungraun_args+=("$1")
         elif [ "$1" == "--" ]; then
-            parsing_iai_args=1
+            parsing_gungraun_args=1
         else
             cargo_args+=("$1")
         fi
@@ -46,9 +54,9 @@ function run_icount_benchmarks() {
         shift
     done
 
-    # Run iai-callgrind benchmarks. Do this in a subshell with `&& true` to
-    # capture rather than exit on error.
-    (cargo bench "${cargo_args[@]}" -- "${iai_args[@]}") && true
+    # Run gungraun benchmarks. Do this in a subshell with `&& true` to capture
+    # rather than exit on error.
+    (cargo bench "${cargo_args[@]}" -- "${gungraun_args[@]}") && true
     exit_code="$?"
 
     if [ "$exit_code" -eq 0 ]; then
@@ -68,4 +76,4 @@ run_icount_benchmarks -- --save-baseline=hardfloat
 # Name and tar the new baseline
 name="baseline-icount-$tag-$(date -u +'%Y%m%d%H%M')-${GITHUB_SHA:0:12}"
 echo "BASELINE_NAME=$name" >>"$GITHUB_ENV"
-tar cJf "$name.tar.xz" "$iai_home"
+tar cJf "$name.tar.xz" "$gungraun_home"
diff --git a/library/compiler-builtins/ci/ci-util.py b/library/compiler-builtins/ci/ci-util.py
index c1db17c6c9010..ef9ce455178ec 100755
--- a/library/compiler-builtins/ci/ci-util.py
+++ b/library/compiler-builtins/ci/ci-util.py
@@ -38,7 +38,7 @@
             `--tag` can be specified to look for artifacts with a specific tag, such as
             for a specific architecture.
 
-            Note that `--extract` will overwrite files in `iai-home`.
+            Note that `--extract` will overwrite files in `gungraun-home`.
 
         handle-bench-regressions PR_NUMBER
             Exit with success if the pull request contains a line starting with
@@ -49,7 +49,7 @@
 
 REPO_ROOT = Path(__file__).parent.parent
 GIT = ["git", "-C", REPO_ROOT]
-DEFAULT_BRANCH = "master"
+DEFAULT_BRANCH = "main"
 WORKFLOW_NAME = "CI"  # Workflow that generates the benchmark artifacts
 ARTIFACT_PREFIX = "baseline-icount*"
 
@@ -186,7 +186,7 @@ def __init__(self) -> None:
 
     def _init_change_list(self):
         """Create a list of files that have been changed. This uses GITHUB_REF if
-        available, otherwise a diff between `HEAD` and `master`.
+        available, otherwise a diff between `HEAD` and `main`.
         """
 
         # For pull requests, GitHub creates a ref `refs/pull/1234/merge` (1234 being
@@ -390,6 +390,7 @@ def locate_baseline(flags: list[str]) -> None:
 
     artifact_glob = f"{ARTIFACT_PREFIX}{f"-{tag}" if tag else ""}*"
 
+    # `check=False`: the download fails if the file already exists, which is fine.
     sp.run(
         ["gh", "run", "download", str(job_id), f"--pattern={artifact_glob}"],
         check=False,
@@ -409,7 +410,17 @@ def locate_baseline(flags: list[str]) -> None:
     candidate_baselines.sort(reverse=True)
     baseline_archive = candidate_baselines[0]
     eprint(f"extracting {baseline_archive}")
-    sp.run(["tar", "xJvf", baseline_archive], check=True)
+
+    all_paths = sp.check_output(["tar", "tJf", baseline_archive], encoding="utf8")
+    sp.run(["tar", "xJf", baseline_archive], check=True)
+
+    # Summarize paths to at most three components; the full `tar v` list is huge
+    short_paths = re.findall(r"^(?:[^/\n]+/?){1,3}", all_paths, re.MULTILINE)
+
+    print("Extracted:")
+    for path in sorted(set(short_paths)):
+        print(f"* {path}")
+
     eprint("baseline extracted successfully")
 
 
diff --git a/library/compiler-builtins/compiler-builtins/Cargo.toml b/library/compiler-builtins/compiler-builtins/Cargo.toml
index 8bbe136ce33e3..0845861dcfe3c 100644
--- a/library/compiler-builtins/compiler-builtins/Cargo.toml
+++ b/library/compiler-builtins/compiler-builtins/Cargo.toml
@@ -7,7 +7,12 @@
 [package]
 name = "compiler_builtins"
 version = "0.1.160"
-authors = ["Jorge Aparicio <japaricious@gmail.com>"]
+authors = [
+    "Alex Crichton <alex@alexcrichton.com>",
+    "Amanieu d'Antras <amanieu@gmail.com>",
+    "Jorge Aparicio <japaricious@gmail.com>",
+    "Trevor Gross <tg@trevorgross.com>",
+]
 description = "Compiler intrinsics used by the Rust compiler."
 repository = "https://github.com/rust-lang/compiler-builtins"
 license = "MIT AND Apache-2.0 WITH LLVM-exception AND (MIT OR Apache-2.0)"
diff --git a/library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/mod.rs b/library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/mod.rs
index 7841e4f33cd66..5ffe1f59b4db6 100644
--- a/library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/mod.rs
+++ b/library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/mod.rs
@@ -196,13 +196,12 @@ unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) {
     unsafe {
         // divides the combined registers rdx:rax (`duo` is split into two 64 bit parts to do this)
         // by `div`. The quotient is stored in rax and the remainder in rdx.
-        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
         core::arch::asm!(
             "div {0}",
             in(reg) div,
             inlateout("rax") duo_lo => quo,
             inlateout("rdx") duo_hi => rem,
-            options(att_syntax, pure, nomem, nostack)
+            options(pure, nomem, nostack),
         );
     }
     (quo, rem)
@@ -283,13 +282,12 @@ unsafe fn u64_by_u32_div_rem(duo: u64, div: u32) -> (u32, u32) {
     unsafe {
         // divides the combined registers rdx:rax (`duo` is split into two 32 bit parts to do this)
         // by `div`. The quotient is stored in rax and the remainder in rdx.
-        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
         core::arch::asm!(
             "div {0}",
             in(reg) div,
             inlateout("rax") duo_lo => quo,
             inlateout("rdx") duo_hi => rem,
-            options(att_syntax, pure, nomem, nostack)
+            options(pure, nomem, nostack),
         );
     }
     (quo, rem)
diff --git a/library/compiler-builtins/compiler-builtins/src/mem/x86_64.rs b/library/compiler-builtins/compiler-builtins/src/mem/x86_64.rs
index fb29eb11b231e..bf36a286ac951 100644
--- a/library/compiler-builtins/compiler-builtins/src/mem/x86_64.rs
+++ b/library/compiler-builtins/compiler-builtins/src/mem/x86_64.rs
@@ -22,13 +22,12 @@ use core::{intrinsics, mem};
 #[inline(always)]
 #[cfg(target_feature = "ermsb")]
 pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
-    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
-    core::arch::asm!(
-        "repe movsb (%rsi), (%rdi)",
+    asm!(
+        "rep movsb [rdi], [rsi]",
         inout("rcx") count => _,
         inout("rdi") dest => _,
         inout("rsi") src => _,
-        options(att_syntax, nostack, preserves_flags)
+        options(nostack, preserves_flags)
     );
 }
 
@@ -42,21 +41,21 @@ pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize)
         inout("ecx") pre_byte_count => _,
         inout("rdi") dest => dest,
         inout("rsi") src => src,
-        options(att_syntax, nostack, preserves_flags)
+        options(nostack, preserves_flags)
     );
     asm!(
         "rep movsq",
         inout("rcx") qword_count => _,
         inout("rdi") dest => dest,
         inout("rsi") src => src,
-        options(att_syntax, nostack, preserves_flags)
+        options(nostack, preserves_flags)
     );
     asm!(
         "rep movsb",
         inout("ecx") byte_count => _,
         inout("rdi") dest => _,
         inout("rsi") src => _,
-        options(att_syntax, nostack, preserves_flags)
+        options(nostack, preserves_flags)
     );
 }
 
@@ -67,14 +66,13 @@ pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
     asm!(
         "std",
         "rep movsb",
-        "sub $7, %rsi",
-        "sub $7, %rdi",
-        "mov {qword_count:r}, %rcx",
+        "sub rsi, 7",
+        "sub rdi, 7",
+        "mov rcx, {qword_count:r}",
         "rep movsq",
-        "test {pre_byte_count:e}, {pre_byte_count:e}",
-        "add $7, %rsi",
-        "add $7, %rdi",
-        "mov {pre_byte_count:e}, %ecx",
+        "add rsi, 7",
+        "add rdi, 7",
+        "mov ecx, {pre_byte_count:e}",
         "rep movsb",
         "cld",
         pre_byte_count = in(reg) pre_byte_count,
@@ -82,21 +80,19 @@ pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
         inout("ecx") byte_count => _,
         inout("rdi") dest.add(count - 1) => _,
         inout("rsi") src.add(count - 1) => _,
-        // We modify flags, but we restore it afterwards
-        options(att_syntax, nostack, preserves_flags)
+        options(nostack)
     );
 }
 
 #[inline(always)]
 #[cfg(target_feature = "ermsb")]
 pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
-    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
-    core::arch::asm!(
-        "repe stosb %al, (%rdi)",
+    asm!(
+        "rep stosb [rdi], al",
         inout("rcx") count => _,
         inout("rdi") dest => _,
         inout("al") c => _,
-        options(att_syntax, nostack, preserves_flags)
+        options(nostack, preserves_flags)
     )
 }
 
@@ -111,21 +107,21 @@ pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) {
         inout("ecx") pre_byte_count => _,
         inout("rdi") dest => dest,
         in("rax") c,
-        options(att_syntax, nostack, preserves_flags)
+        options(nostack, preserves_flags)
     );
     asm!(
         "rep stosq",
         inout("rcx") qword_count => _,
         inout("rdi") dest => dest,
         in("rax") c,
-        options(att_syntax, nostack, preserves_flags)
+        options(nostack, preserves_flags)
     );
     asm!(
         "rep stosb",
         inout("ecx") byte_count => _,
         inout("rdi") dest => _,
         in("rax") c,
-        options(att_syntax, nostack, preserves_flags)
+        options(nostack, preserves_flags)
     );
 }
 
@@ -212,10 +208,10 @@ pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
     let x = {
         let r;
         asm!(
-            "movdqa ({addr:r}), {dest}",
+            "movdqa {dest}, [{addr:r}]",
             addr = in(reg) s,
             dest = out(xmm_reg) r,
-            options(att_syntax, nostack),
+            options(nostack, preserves_flags),
         );
         r
     };
@@ -232,10 +228,10 @@ pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
         let x = {
             let r;
             asm!(
-                "movdqa ({addr:r}), {dest}",
+                "movdqa {dest}, [{addr:r}]",
                 addr = in(reg) s,
                 dest = out(xmm_reg) r,
-                options(att_syntax, nostack),
+                options(nostack, preserves_flags),
             );
             r
         };
@@ -277,10 +273,10 @@ pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
         let mut cs = {
             let r: u64;
             asm!(
-                "mov ({addr}), {dest}",
+                "mov {dest}, [{addr}]",
                 addr = in(reg) s,
                 dest = out(reg) r,
-                options(att_syntax, nostack),
+                options(nostack, preserves_flags),
             );
             r
         };
diff --git a/library/compiler-builtins/compiler-builtins/src/probestack.rs b/library/compiler-builtins/compiler-builtins/src/probestack.rs
index 72975485a7765..1cab64ea113c5 100644
--- a/library/compiler-builtins/compiler-builtins/src/probestack.rs
+++ b/library/compiler-builtins/compiler-builtins/src/probestack.rs
@@ -47,11 +47,11 @@
 // We only define stack probing for these architectures today.
 #![cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 
-// Our goal here is to touch each page between %rsp+8 and %rsp+8-%rax,
+// Our goal here is to touch each page between `rsp+8` and `rsp+8-rax`,
 // ensuring that if any pages are unmapped we'll make a page fault.
 //
-// The ABI here is that the stack frame size is located in `%rax`. Upon
-// return we're not supposed to modify `%rsp` or `%rax`.
+// The ABI here is that the stack frame size is located in `rax`. Upon
+// return we're not supposed to modify `rsp` or `rax`.
 #[cfg(target_arch = "x86_64")]
 #[unsafe(naked)]
 #[rustc_std_internal_symbol]
@@ -59,50 +59,50 @@ pub unsafe extern "custom" fn __rust_probestack() {
     core::arch::naked_asm!(
         "
             .cfi_startproc
-            pushq  %rbp
+            push  rbp
             .cfi_adjust_cfa_offset 8
-            .cfi_offset %rbp, -16
-            movq   %rsp, %rbp
-            .cfi_def_cfa_register %rbp
+            .cfi_offset rbp, -16
+            mov   rbp, rsp
+            .cfi_def_cfa_register rbp
 
-            mov    %rax,%r11        // duplicate %rax as we're clobbering %r11
+            mov    r11, rax        // duplicate rax as we're clobbering r11
 
             // Main loop, taken in one page increments. We're decrementing rsp by
             // a page each time until there's less than a page remaining. We're
             // guaranteed that this function isn't called unless there's more than a
             // page needed.
             //
-            // Note that we're also testing against `8(%rsp)` to account for the 8
+            // Note that we're also testing against `[rsp + 8]` to account for the 8
             // bytes pushed on the stack originally with our return address. Using
-            // `8(%rsp)` simulates us testing the stack pointer in the caller's
+            // `[rsp + 8]` simulates us testing the stack pointer in the caller's
             // context.
 
-            // It's usually called when %rax >= 0x1000, but that's not always true.
+            // It's usually called when rax >= 0x1000, but that's not always true.
             // Dynamic stack allocation, which is needed to implement unsized
-            // rvalues, triggers stackprobe even if %rax < 0x1000.
-            // Thus we have to check %r11 first to avoid segfault.
-            cmp    $0x1000,%r11
+            // rvalues, triggers stackprobe even if rax < 0x1000.
+            // Thus we have to check r11 first to avoid segfault.
+            cmp    r11, 0x1000
             jna    3f
         2:
-            sub    $0x1000,%rsp
-            test   %rsp,8(%rsp)
-            sub    $0x1000,%r11
-            cmp    $0x1000,%r11
+            sub    rsp, 0x1000
+            test   qword ptr [rsp + 8], rsp
+            sub    r11, 0x1000
+            cmp    r11, 0x1000
             ja     2b
 
         3:
             // Finish up the last remaining stack space requested, getting the last
             // bits out of r11
-            sub    %r11,%rsp
-            test   %rsp,8(%rsp)
+            sub    rsp, r11
+            test   qword ptr [rsp + 8], rsp
 
             // Restore the stack pointer to what it previously was when entering
             // this function. The caller will readjust the stack pointer after we
             // return.
-            add    %rax,%rsp
+            add    rsp, rax
 
             leave
-            .cfi_def_cfa_register %rsp
+            .cfi_def_cfa_register rsp
             .cfi_adjust_cfa_offset -8
     ",
     #[cfg(not(all(target_env = "sgx", target_vendor = "fortanix")))]
@@ -112,14 +112,13 @@ pub unsafe extern "custom" fn __rust_probestack() {
             // for this target, [manually patch for LVI].
             //
             // [manually patch for LVI]: https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions
-            pop %r11
+            pop r11
             lfence
-            jmp *%r11
+            jmp r11
     ",
     "
             .cfi_endproc
     ",
-        options(att_syntax)
     )
 }
 
@@ -135,36 +134,35 @@ pub unsafe extern "custom" fn __rust_probestack() {
     core::arch::naked_asm!(
         "
             .cfi_startproc
-            push   %ebp
+            push   ebp
             .cfi_adjust_cfa_offset 4
-            .cfi_offset %ebp, -8
-            mov    %esp, %ebp
-            .cfi_def_cfa_register %ebp
-            push   %ecx
-            mov    %eax,%ecx
+            .cfi_offset ebp, -8
+            mov    ebp, esp
+            .cfi_def_cfa_register ebp
+            push   ecx
+            mov    ecx, eax
 
-            cmp    $0x1000,%ecx
+            cmp    ecx, 0x1000
             jna    3f
         2:
-            sub    $0x1000,%esp
-            test   %esp,8(%esp)
-            sub    $0x1000,%ecx
-            cmp    $0x1000,%ecx
+            sub    esp, 0x1000
+            test   dword ptr [esp + 8], esp
+            sub    ecx, 0x1000
+            cmp    ecx, 0x1000
             ja     2b
 
         3:
-            sub    %ecx,%esp
-            test   %esp,8(%esp)
+            sub    esp, ecx
+            test   dword ptr [esp + 8], esp
 
-            add    %eax,%esp
-            pop    %ecx
+            add    esp, eax
+            pop    ecx
             leave
-            .cfi_def_cfa_register %esp
+            .cfi_def_cfa_register esp
             .cfi_adjust_cfa_offset -4
             ret
             .cfi_endproc
-    ",
-        options(att_syntax)
+        ",
     )
 }
 
@@ -176,8 +174,8 @@ pub unsafe extern "custom" fn __rust_probestack() {
 // REF: Rust commit(74e80468347)
 // rust\src\llvm-project\llvm\lib\Target\X86\X86FrameLowering.cpp: 805
 // Comments in LLVM:
-//   MSVC x32's _chkstk and cygwin/mingw's _alloca adjust %esp themselves.
-//   MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
+//   MSVC x32's _chkstk and cygwin/mingw's _alloca adjust `esp` themselves.
+//   MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust `rsp`
 //   themselves.
 #[unsafe(naked)]
 #[rustc_std_internal_symbol]
@@ -185,40 +183,39 @@ pub unsafe extern "custom" fn __rust_probestack() {
     core::arch::naked_asm!(
         "
             .cfi_startproc
-            push   %ebp
+            push   ebp
             .cfi_adjust_cfa_offset 4
-            .cfi_offset %ebp, -8
-            mov    %esp, %ebp
-            .cfi_def_cfa_register %ebp
-            push   %ecx
-            push   %edx
-            mov    %eax,%ecx
-
-            cmp    $0x1000,%ecx
+            .cfi_offset ebp, -8
+            mov    ebp, esp
+            .cfi_def_cfa_register ebp
+            push   ecx
+            push   edx
+            mov    ecx, eax
+
+            cmp    ecx, 0x1000
             jna    3f
         2:
-            sub    $0x1000,%esp
-            test   %esp,8(%esp)
-            sub    $0x1000,%ecx
-            cmp    $0x1000,%ecx
+            sub    esp, 0x1000
+            test   dword ptr [esp + 8], esp
+            sub    ecx, 0x1000
+            cmp    ecx, 0x1000
             ja     2b
 
         3:
-            sub    %ecx,%esp
-            test   %esp,8(%esp)
-            mov    4(%ebp),%edx
-            mov    %edx, 12(%esp)
-            add    %eax,%esp
-            pop    %edx
-            pop    %ecx
+            sub    esp, ecx
+            test   dword ptr [esp + 8], esp
+            mov    edx, dword ptr [ebp + 4]
+            mov    dword ptr [esp + 12], edx
+            add    esp, eax
+            pop    edx
+            pop    ecx
             leave
 
-            sub   %eax, %esp
-            .cfi_def_cfa_register %esp
+            sub   esp, eax
+            .cfi_def_cfa_register esp
             .cfi_adjust_cfa_offset -4
             ret
             .cfi_endproc
-    ",
-        options(att_syntax)
+        ",
     )
 }
diff --git a/library/compiler-builtins/compiler-builtins/src/x86.rs b/library/compiler-builtins/compiler-builtins/src/x86.rs
index 51940b3b338a2..1a3c418609451 100644
--- a/library/compiler-builtins/compiler-builtins/src/x86.rs
+++ b/library/compiler-builtins/compiler-builtins/src/x86.rs
@@ -22,26 +22,25 @@ intrinsics! {
     pub unsafe extern "custom" fn _alloca() {
         // __chkstk and _alloca are the same function
         core::arch::naked_asm!(
-            "push   %ecx",
-            "cmp    $0x1000,%eax",
-            "lea    8(%esp),%ecx", // esp before calling this routine -> ecx
-            "jb     1f",
+            "push   ecx",            // save ecx; it serves as the probe pointer below
+            "cmp    eax, 0x1000",    // flags are consumed by `jb` (lea leaves them intact)
+            "lea    ecx, [esp + 8]", // esp before calling this routine -> ecx
+            "jb     3f",             // eax < 0x1000: skip the 0x1000-step loop
             "2:",
-            "sub    $0x1000,%ecx",
-            "test   %ecx,(%ecx)",
-            "sub    $0x1000,%eax",
-            "cmp    $0x1000,%eax",
+            "sub    ecx, 0x1000",    // move the probe pointer down by 0x1000
+            "test   [ecx], ecx",     // touch the memory at ecx; the flags are not used
+            "sub    eax, 0x1000",    // 0x1000 fewer bytes left to probe
+            "cmp    eax, 0x1000",    // loop while more than 0x1000 bytes remain
             "ja     2b",
-            "1:",
-            "sub    %eax,%ecx",
-            "test   %ecx,(%ecx)",
-            "lea    4(%esp),%eax",  // load pointer to the return address into eax
-            "mov    %ecx,%esp",     // install the new top of stack pointer into esp
-            "mov    -4(%eax),%ecx", // restore ecx
-            "push   (%eax)",        // push return address onto the stack
-            "sub    %esp,%eax",     // restore the original value in eax
+            "3:",                    // fewer than 0x1000 bytes (or exactly 0x1000) remain
+            "sub    ecx, eax",       // step down by the remainder left in eax
+            "test   [ecx], ecx",     // touch the final address; the flags are not used
+            "lea    eax, [esp + 4]", // load pointer to the return address into eax
+            "mov    esp, ecx",       // install the new top of stack pointer into esp
+            "mov    ecx, [eax - 4]", // restore ecx
+            "push   [eax]",          // push return address onto the stack
+            "sub    eax, esp",       // restore the original value in eax
             "ret",
-            options(att_syntax)
         );
     }
 }
diff --git a/library/compiler-builtins/compiler-builtins/src/x86_64.rs b/library/compiler-builtins/compiler-builtins/src/x86_64.rs
index f9ae784d57520..99a527ee9ac5e 100644
--- a/library/compiler-builtins/compiler-builtins/src/x86_64.rs
+++ b/library/compiler-builtins/compiler-builtins/src/x86_64.rs
@@ -12,24 +12,23 @@ intrinsics! {
     #[cfg(any(all(windows, target_env = "gnu"), target_os = "cygwin", target_os = "uefi"))]
     pub unsafe extern "custom" fn ___chkstk_ms() {
         core::arch::naked_asm!(
-            "push   %rcx",
-            "push   %rax",
-            "cmp    $0x1000,%rax",
-            "lea    24(%rsp),%rcx",
-            "jb     1f",
+            "push   rcx",             // save rcx; it serves as the probe pointer below
+            "push   rax",             // save rax; it is counted down in the loop
+            "cmp    rax, 0x1000",     // flags are consumed by `jb` (lea leaves them intact)
+            "lea    rcx, [rsp + 24]", // address above the two saved registers and the return address
+            "jb     3f",              // rax < 0x1000: skip the 0x1000-step loop
             "2:",
-            "sub    $0x1000,%rcx",
-            "test   %rcx,(%rcx)",
-            "sub    $0x1000,%rax",
-            "cmp    $0x1000,%rax",
+            "sub    rcx, 0x1000",     // move the probe pointer down by 0x1000
+            "test   [rcx], rcx",      // touch the memory at rcx; the flags are not used
+            "sub    rax, 0x1000",     // 0x1000 fewer bytes left to probe
+            "cmp    rax, 0x1000",     // loop while more than 0x1000 bytes remain
             "ja     2b",
-            "1:",
-            "sub    %rax,%rcx",
-            "test   %rcx,(%rcx)",
-            "pop    %rax",
-            "pop    %rcx",
+            "3:",                     // fewer than 0x1000 bytes (or exactly 0x1000) remain
+            "sub    rcx, rax",        // step down by the remainder left in rax
+            "test   [rcx], rcx",      // touch the final address; the flags are not used
+            "pop    rax",             // restore rax
+            "pop    rcx",             // restore rcx; rsp is back at its entry value
             "ret",
-            options(att_syntax)
         );
     }
 }
diff --git a/library/compiler-builtins/crates/panic-handler/Cargo.toml b/library/compiler-builtins/crates/panic-handler/Cargo.toml
index a6764fc481b64..70898368d8e73 100644
--- a/library/compiler-builtins/crates/panic-handler/Cargo.toml
+++ b/library/compiler-builtins/crates/panic-handler/Cargo.toml
@@ -1,7 +1,6 @@
 [package]
 name = "panic-handler"
 version = "0.1.0"
-authors = ["Alex Crichton <alex@alexcrichton.com>"]
 edition = "2024"
 publish = false
 
diff --git a/library/compiler-builtins/libm-test/Cargo.toml b/library/compiler-builtins/libm-test/Cargo.toml
index 0af6b0c1da5ca..adecfc1af6b87 100644
--- a/library/compiler-builtins/libm-test/Cargo.toml
+++ b/library/compiler-builtins/libm-test/Cargo.toml
@@ -21,8 +21,8 @@ build-musl = ["dep:musl-math-sys"]
 # Enable report generation without bringing in more dependencies by default
 benchmarking-reports = ["criterion/plotters", "criterion/html_reports"]
 
-# Enable icount benchmarks (requires iai-callgrind and valgrind)
-icount = ["dep:iai-callgrind"]
+# Enable icount benchmarks (requires gungraun-runner and valgrind locally)
+icount = ["dep:gungraun"]
 
 # Run with a reduced set of benchmarks, such as for CI
 short-benchmarks = []
@@ -31,7 +31,7 @@ short-benchmarks = []
 anyhow = "1.0.98"
 # This is not directly used but is required so we can enable `gmp-mpfr-sys/force-cross`.
 gmp-mpfr-sys = { version = "1.6.5", optional = true, default-features = false }
-iai-callgrind = { version = "0.15.2", optional = true }
+gungraun = { version = "0.17.0", optional = true }
 indicatif = { version = "0.18.0", default-features = false }
 libm = { path = "../libm", features = ["unstable-public-internals"] }
 libm-macros = { path = "../crates/libm-macros" }
diff --git a/library/compiler-builtins/libm-test/benches/icount.rs b/library/compiler-builtins/libm-test/benches/icount.rs
index 0b85771225dde..fb856d9be4517 100644
--- a/library/compiler-builtins/libm-test/benches/icount.rs
+++ b/library/compiler-builtins/libm-test/benches/icount.rs
@@ -1,10 +1,10 @@
-//! Benchmarks that use `iai-cachegrind` to be reasonably CI-stable.
+//! Benchmarks that use `gungraun` to be reasonably CI-stable.
 #![feature(f16)]
 #![feature(f128)]
 
 use std::hint::black_box;
 
-use iai_callgrind::{library_benchmark, library_benchmark_group, main};
+use gungraun::{library_benchmark, library_benchmark_group, main};
 use libm::support::{HInt, Hexf, hf16, hf32, hf64, hf128, u256};
 use libm_test::generate::spaced;
 use libm_test::{CheckBasis, CheckCtx, GeneratorKind, MathOp, OpRustArgs, TupleCall, op};
@@ -156,7 +156,13 @@ fn icount_bench_u256_shr(cases: Vec<(u256, u32)>) {
 
 library_benchmark_group!(
     name = icount_bench_u128_group;
-    benchmarks = icount_bench_u128_widen_mul, icount_bench_u256_narrowing_div, icount_bench_u256_add, icount_bench_u256_sub, icount_bench_u256_shl, icount_bench_u256_shr
+    benchmarks =
+    icount_bench_u128_widen_mul,
+    icount_bench_u256_narrowing_div,
+    icount_bench_u256_add,
+    icount_bench_u256_sub,
+    icount_bench_u256_shl,
+    icount_bench_u256_shr
 );
 
 #[library_benchmark]
diff --git a/library/compiler-builtins/libm-test/src/precision.rs b/library/compiler-builtins/libm-test/src/precision.rs
index c441922d302b9..7887c032394b8 100644
--- a/library/compiler-builtins/libm-test/src/precision.rs
+++ b/library/compiler-builtins/libm-test/src/precision.rs
@@ -83,6 +83,19 @@ pub fn default_ulp(ctx: &CheckCtx) -> u32 {
         Bn::Tgamma => 20,
     };
 
+    // These have a separate implementation on i586
+    if cfg!(x86_no_sse) {
+        match ctx.fn_ident {
+            Id::Exp => ulp = 1,
+            Id::Exp2 => ulp = 1,
+            Id::Exp10 => ulp = 1,
+            Id::Expf => ulp = 0,
+            Id::Exp2f => ulp = 0,
+            Id::Exp10f => ulp = 0,
+            _ => (),
+        }
+    }
+
     // There are some cases where musl's approximation is less accurate than ours. For these
     // cases, increase the ULP.
     if ctx.basis == Musl {
@@ -98,6 +111,8 @@ pub fn default_ulp(ctx: &CheckCtx) -> u32 {
             Id::Cbrt => ulp = 2,
             // FIXME(#401): musl has an incorrect result here.
             Id::Fdim => ulp = 2,
+            Id::Exp2f => ulp = 1,
+            Id::Expf => ulp = 1,
             Id::Sincosf => ulp = 500,
             Id::Tgamma => ulp = 20,
             _ => (),
@@ -124,8 +139,6 @@ pub fn default_ulp(ctx: &CheckCtx) -> u32 {
             Id::Asinh => ulp = 3,
             Id::Asinhf => ulp = 3,
             Id::Cbrt => ulp = 1,
-            Id::Exp10 | Id::Exp10f => ulp = 1_000_000,
-            Id::Exp2 | Id::Exp2f => ulp = 10_000_000,
             Id::Log1p | Id::Log1pf => ulp = 2,
             Id::Tan => ulp = 2,
             _ => (),
@@ -205,36 +218,6 @@ impl MaybeOverride<(f16,)> for SpecialCase {}
 
 impl MaybeOverride<(f32,)> for SpecialCase {
     fn check_float<F: Float>(input: (f32,), actual: F, expected: F, ctx: &CheckCtx) -> CheckAction {
-        if ctx.base_name == BaseName::Expm1
-            && !input.0.is_infinite()
-            && input.0 > 80.0
-            && actual.is_infinite()
-            && !expected.is_infinite()
-        {
-            // we return infinity but the number is representable
-            if ctx.basis == CheckBasis::Musl {
-                return XFAIL_NOCHECK;
-            }
-            return XFAIL("expm1 representable numbers");
-        }
-
-        if cfg!(x86_no_sse)
-            && ctx.base_name == BaseName::Exp2
-            && !expected.is_infinite()
-            && actual.is_infinite()
-        {
-            // We return infinity when there is a representable value. Test input: 127.97238
-            return XFAIL("586 exp2 representable numbers");
-        }
-
-        if ctx.base_name == BaseName::Sinh && input.0.abs() > 80.0 && actual.is_nan() {
-            // we return some NaN that should be real values or infinite
-            if ctx.basis == CheckBasis::Musl {
-                return XFAIL_NOCHECK;
-            }
-            return XFAIL("sinh unexpected NaN");
-        }
-
         if (ctx.base_name == BaseName::Lgamma || ctx.base_name == BaseName::LgammaR)
             && input.0 > 4e36
             && expected.is_infinite()
@@ -278,14 +261,6 @@ impl MaybeOverride<(f64,)> for SpecialCase {
             return XFAIL("i586 rint rounding mode");
         }
 
-        if cfg!(x86_no_sse)
-            && (ctx.fn_ident == Identifier::Exp10 || ctx.fn_ident == Identifier::Exp2)
-        {
-            // FIXME: i586 has very imprecise results with ULP > u32::MAX for these
-            // operations so we can't reasonably provide a limit.
-            return XFAIL_NOCHECK;
-        }
-
         if ctx.base_name == BaseName::J0 && input.0 < -1e300 {
             // Errors get huge close to -inf
             return XFAIL_NOCHECK;
diff --git a/library/compiler-builtins/libm/Cargo.toml b/library/compiler-builtins/libm/Cargo.toml
index 63b4d3c277989..5b5ca34fd2c9e 100644
--- a/library/compiler-builtins/libm/Cargo.toml
+++ b/library/compiler-builtins/libm/Cargo.toml
@@ -1,7 +1,12 @@
 [package]
 name = "libm"
 version = "0.2.15"
-authors = ["Jorge Aparicio <jorge@japaric.io>"]
+authors = [
+    "Alex Crichton <alex@alexcrichton.com>",
+    "Amanieu d'Antras <amanieu@gmail.com>",
+    "Jorge Aparicio <japaricious@gmail.com>",
+    "Trevor Gross <tg@trevorgross.com>",
+]
 description = "libm in pure Rust"
 categories = ["no-std"]
 keywords = ["libm", "math"]
diff --git a/library/compiler-builtins/libm/src/math/arch/i586.rs b/library/compiler-builtins/libm/src/math/arch/i586.rs
index b9a66762063db..d9bb93fbf5852 100644
--- a/library/compiler-builtins/libm/src/math/arch/i586.rs
+++ b/library/compiler-builtins/libm/src/math/arch/i586.rs
@@ -60,3 +60,62 @@ pub fn floor(mut x: f64) -> f64 {
     }
     x
 }
+/// Implements the exponential functions with `x87` assembly.
+///
+/// This relies on the instruction `f2xm1`, which computes `2^x - 1` (for
+/// |x| < 1). This transcendental instruction is documented to produce results
+/// with error below 1ulp (in the native double-extended precision format). This
+/// translates to correctly rounded results for f32, but results in f64 may have
+/// 1ulp error, which may depend on the hardware.
+macro_rules! x87exp {
+    ($float_ty:ident, $word_size:literal, $fn_name:ident,  $load_op:literal) => {
+        pub fn $fn_name(mut x: $float_ty) -> $float_ty { unsafe {
+            core::arch::asm!(
+                // Prepare the register stack as
+                // ```
+                // st(0) = y = x*log2(base)
+                // st(1) = 1.0
+                // st(2) = round(y)
+                // ```
+                concat!($load_op, " ", $word_size, " ptr [{x}]"), // st(0) = y, read through the pointer
+                "fld1",       // push 1.0:       st(0) = 1.0, st(1) = y
+                "fld st(1)",  // push a copy:    st(0) = y, st(1) = 1.0, st(2) = y
+                "frndint",    // st(0) = round(y), using the current x87 rounding mode
+                "fxch st(2)", // swap:           st(0) = y, st(2) = round(y)
+
+                // Compare y with round(y) to determine if y is finite and
+                // not an integer. If so, compute `exp2(y - round(y))` into
+                // st(1). Otherwise skip ahead with `st(1) = 1.0`
+                "fucom st(2)",     // unordered compare of y with round(y)
+                "fstsw ax",        // copy the x87 status word into ax
+                "test ax, 0x4000", // bit 14 is C3, set when equal or unordered (NaN)
+                "jnz 2f",          // integer, infinite or NaN y: keep st(1) = 1.0
+                "fsub st(0), st(2)", // st(0) = y - round(y)
+                "f2xm1",             // st(0) = 2^st(0) - 1.0
+                "fadd st(1), st(0)", // st(1) = 1 + st(0) = exp2(y - round(y))
+                "2:",
+
+                // Finally, scale by `exp2(round(y))` and clear the stack.
+                "fstp st(0)", // pop y (or the f2xm1 result): st(0) = old st(1), st(1) = round(y)
+                "fscale",     // st(0) = st(0) * 2^st(1)
+                concat!("fstp ", $word_size, " ptr [{x}]"), // store the result over x and pop
+                "fstp st(0)", // pop round(y); the register stack is empty again
+                x = in(reg) &mut x, // address of the local; the asm both reads and writes it
+                out("ax") _,        // overwritten by `fstsw ax`
+                out("st(0)") _, out("st(1)") _, // every x87 stack register is declared
+                out("st(2)") _, out("st(3)") _, // as clobbered
+                out("st(4)") _, out("st(5)") _,
+                out("st(6)") _, out("st(7)") _,
+                options(nostack),
+            );
+            x
+        }}
+    };
+}
+
+x87exp!(f32, "dword", x87_exp2f, "fld"); // y = x
+x87exp!(f64, "qword", x87_exp2, "fld"); // y = x
+x87exp!(f32, "dword", x87_exp10f, "fldl2t\nfmul"); // y = log2(10) * x
+x87exp!(f64, "qword", x87_exp10, "fldl2t\nfmul"); // y = log2(10) * x
+x87exp!(f32, "dword", x87_expf, "fldl2e\nfmul"); // y = log2(e) * x
+x87exp!(f64, "qword", x87_exp, "fldl2e\nfmul"); // y = log2(e) * x
diff --git a/library/compiler-builtins/libm/src/math/arch/mod.rs b/library/compiler-builtins/libm/src/math/arch/mod.rs
index 984ae7f3129f9..ba859c679d0db 100644
--- a/library/compiler-builtins/libm/src/math/arch/mod.rs
+++ b/library/compiler-builtins/libm/src/math/arch/mod.rs
@@ -48,3 +48,8 @@ cfg_if! {
         pub use i586::{ceil, floor};
     }
 }
+cfg_if! {
+    if #[cfg(x86_no_sse)] {
+        pub use i586::{x87_exp10f, x87_exp10, x87_expf, x87_exp, x87_exp2f, x87_exp2};
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/exp.rs b/library/compiler-builtins/libm/src/math/exp.rs
index 78ce5dd134ac3..cb939ad5d8bf2 100644
--- a/library/compiler-builtins/libm/src/math/exp.rs
+++ b/library/compiler-builtins/libm/src/math/exp.rs
@@ -83,6 +83,12 @@ const P5: f64 = 4.13813679705723846039e-08; /* 0x3E663769, 0x72BEA4D0 */
 /// (where *e* is the base of the natural system of logarithms, approximately 2.71828).
 #[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn exp(mut x: f64) -> f64 {
+    select_implementation! {
+        name: x87_exp,
+        use_arch_required: x86_no_sse,
+        args: x,
+    }
+
     let x1p1023 = f64::from_bits(0x7fe0000000000000); // 0x1p1023 === 2 ^ 1023
     let x1p_149 = f64::from_bits(0x36a0000000000000); // 0x1p-149 === 2 ^ -149
 
diff --git a/library/compiler-builtins/libm/src/math/exp10.rs b/library/compiler-builtins/libm/src/math/exp10.rs
index 1f49f5e96979c..e0af1945b922a 100644
--- a/library/compiler-builtins/libm/src/math/exp10.rs
+++ b/library/compiler-builtins/libm/src/math/exp10.rs
@@ -9,6 +9,12 @@ const P10: &[f64] = &[
 /// Calculates 10 raised to the power of `x` (f64).
 #[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn exp10(x: f64) -> f64 {
+    select_implementation! {
+        name: x87_exp10,
+        use_arch_required: x86_no_sse,
+        args: x,
+    }
+
     let (mut y, n) = modf(x);
     let u: u64 = n.to_bits();
     /* fabs(n) < 16 without raising invalid on nan */
diff --git a/library/compiler-builtins/libm/src/math/exp10f.rs b/library/compiler-builtins/libm/src/math/exp10f.rs
index 22a264211d03e..f0a311c2d1915 100644
--- a/library/compiler-builtins/libm/src/math/exp10f.rs
+++ b/library/compiler-builtins/libm/src/math/exp10f.rs
@@ -9,6 +9,12 @@ const P10: &[f32] = &[
 /// Calculates 10 raised to the power of `x` (f32).
 #[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn exp10f(x: f32) -> f32 {
+    select_implementation! {
+        name: x87_exp10f,
+        use_arch_required: x86_no_sse,
+        args: x,
+    }
+
     let (mut y, n) = modff(x);
     let u = n.to_bits();
     /* fabsf(n) < 8 without raising invalid on nan */
diff --git a/library/compiler-builtins/libm/src/math/exp2.rs b/library/compiler-builtins/libm/src/math/exp2.rs
index 6e4cbc29dcc99..08b71587f6de5 100644
--- a/library/compiler-builtins/libm/src/math/exp2.rs
+++ b/library/compiler-builtins/libm/src/math/exp2.rs
@@ -324,6 +324,12 @@ static TBL: [u64; TBLSIZE * 2] = [
 /// Calculate `2^x`, that is, 2 raised to the power `x`.
 #[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn exp2(mut x: f64) -> f64 {
+    select_implementation! {
+        name: x87_exp2,
+        use_arch_required: x86_no_sse,
+        args: x,
+    }
+
     let redux = f64::from_bits(0x4338000000000000) / TBLSIZE as f64;
     let p1 = f64::from_bits(0x3fe62e42fefa39ef);
     let p2 = f64::from_bits(0x3fcebfbdff82c575);
diff --git a/library/compiler-builtins/libm/src/math/exp2f.rs b/library/compiler-builtins/libm/src/math/exp2f.rs
index 733d2f1a84738..ceff6822c5969 100644
--- a/library/compiler-builtins/libm/src/math/exp2f.rs
+++ b/library/compiler-builtins/libm/src/math/exp2f.rs
@@ -75,6 +75,12 @@ static EXP2FT: [u64; TBLSIZE] = [
 /// Calculate `2^x`, that is, 2 raised to the power `x`.
 #[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn exp2f(mut x: f32) -> f32 {
+    select_implementation! {
+        name: x87_exp2f,
+        use_arch_required: x86_no_sse,
+        args: x,
+    }
+
     let redux = f32::from_bits(0x4b400000) / TBLSIZE as f32;
     let p1 = f32::from_bits(0x3f317218);
     let p2 = f32::from_bits(0x3e75fdf0);
diff --git a/library/compiler-builtins/libm/src/math/expf.rs b/library/compiler-builtins/libm/src/math/expf.rs
index dbbfdbba9253b..5541ab79a9c14 100644
--- a/library/compiler-builtins/libm/src/math/expf.rs
+++ b/library/compiler-builtins/libm/src/math/expf.rs
@@ -32,6 +32,12 @@ const P2: f32 = -2.7667332906e-3; /* -0xb55215.0p-32 */
 /// (where *e* is the base of the natural system of logarithms, approximately 2.71828).
 #[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn expf(mut x: f32) -> f32 {
+    select_implementation! {
+        name: x87_expf,
+        use_arch_required: x86_no_sse,
+        args: x,
+    }
+
     let x1p127 = f32::from_bits(0x7f000000); // 0x1p127f === 2 ^ 127
     let x1p_126 = f32::from_bits(0x800000); // 0x1p-126f === 2 ^ -126  /*original 0x1p-149f    ??????????? */
     let mut hx = x.to_bits();
diff --git a/library/compiler-builtins/libm/src/math/expm1f.rs b/library/compiler-builtins/libm/src/math/expm1f.rs
index f77515a4b99b3..388da3f30173d 100644
--- a/library/compiler-builtins/libm/src/math/expm1f.rs
+++ b/library/compiler-builtins/libm/src/math/expm1f.rs
@@ -13,7 +13,6 @@
  * ====================================================
  */
 
-const O_THRESHOLD: f32 = 8.8721679688e+01; /* 0x42b17180 */
 const LN2_HI: f32 = 6.9313812256e-01; /* 0x3f317180 */
 const LN2_LO: f32 = 9.0580006145e-06; /* 0x3717f7d1 */
 const INV_LN2: f32 = 1.4426950216e+00; /* 0x3fb8aa3b */
@@ -50,7 +49,8 @@ pub fn expm1f(mut x: f32) -> f32 {
         if sign {
             return -1.;
         }
-        if x > O_THRESHOLD {
+        if hx > 0x42b17217 {
+            /* x > log(FLT_MAX) */
             x *= x1p127;
             return x;
         }
diff --git a/library/compiler-builtins/libm/src/math/generic/fmod.rs b/library/compiler-builtins/libm/src/math/generic/fmod.rs
index 29acc8a4d5df5..3c3fd44b27cc2 100644
--- a/library/compiler-builtins/libm/src/math/generic/fmod.rs
+++ b/library/compiler-builtins/libm/src/math/generic/fmod.rs
@@ -1,8 +1,12 @@
 /* SPDX-License-Identifier: MIT OR Apache-2.0 */
-use crate::support::{CastFrom, Float, Int, MinInt};
+use crate::support::{CastFrom, CastInto, Float, HInt, Int, MinInt, NarrowingDiv};
 
 #[inline]
-pub fn fmod<F: Float>(x: F, y: F) -> F {
+pub fn fmod<F: Float>(x: F, y: F) -> F
+where
+    F::Int: HInt,
+    <F::Int as HInt>::D: NarrowingDiv,
+{
     let _1 = F::Int::ONE;
     let sx = x.to_bits() & F::SIGN_MASK;
     let ux = x.to_bits() & !F::SIGN_MASK;
@@ -29,7 +33,7 @@ pub fn fmod<F: Float>(x: F, y: F) -> F {
 
     // To compute `(num << ex) % (div << ey)`, first
     // evaluate `rem = (num << (ex - ey)) % div` ...
-    let rem = reduction(num, ex - ey, div);
+    let rem = reduction::<F>(num, ex - ey, div);
     // ... so the result will be `rem << ey`
 
     if rem.is_zero() {
@@ -58,11 +62,55 @@ fn into_sig_exp<F: Float>(mut bits: F::Int) -> (F::Int, u32) {
 }
 
 /// Compute the remainder `(x * 2.pow(e)) % y` without overflow.
-fn reduction<I: Int>(mut x: I, e: u32, y: I) -> I {
-    x %= y;
-    for _ in 0..e {
-        x <<= 1;
-        x = x.checked_sub(y).unwrap_or(x);
+fn reduction<F>(mut x: F::Int, e: u32, y: F::Int) -> F::Int
+where
+    F: Float,
+    F::Int: HInt,
+    <<F as Float>::Int as HInt>::D: NarrowingDiv,
+{
+    // `f16` only has 5 exponent bits, so even `f16::MAX = 65504.0` is only
+    // a 40-bit integer multiple of the smallest subnormal.
+    if F::BITS == 16 {
+        debug_assert!(F::EXP_MAX - F::EXP_MIN == 29);
+        debug_assert!(e <= 29);
+        let u: u16 = x.cast();
+        let v: u16 = y.cast();
+        let u = (u as u64) << e; // at most 16 + 29 bits, so nothing is shifted out of the u64
+        let v = v as u64;
+        return F::Int::cast_from((u % v) as u16); // remainder is < v, so it fits in u16
     }
-    x
+
+    // Ensure `x < 2y` for later steps
+    if x >= (y << 1) {
+        // This case is only reached with subnormal divisors,
+        // but it might be better to just normalize all significands
+        // to make this unnecessary. The further calls could potentially
+        // benefit from assuming a specific fixed leading bit position.
+        x %= y;
+    }
+
+    // The simple implementation seems to be fastest for a short reduction
+    // at this size. The limit here was chosen empirically on an Intel Nehalem.
+    // Newer CPUs that have faster `u64 * u64 -> u128` might not benefit,
+    // and 32-bit systems or architectures without hardware multipliers might
+    // want to do this in more cases.
+    if F::BITS == 64 && e < 32 {
+        // Assumes `x < 2y`
+        for _ in 0..e {
+            x = x.checked_sub(y).unwrap_or(x); // subtract y if x >= y, leaving x < y
+            x <<= 1; // doubling restores the loop invariant `x < 2y`
+        }
+        return x.checked_sub(y).unwrap_or(x); // last conditional subtraction: result < y
+    }
+
+    // Fast path for short reductions
+    if e < F::BITS {
+        let w = x.widen() << e; // shift performed in the wider `HInt::D` type
+        if let Some((_, r)) = w.checked_narrowing_div_rem(y) {
+            return r;
+        }
+        // `None`: fall through to the general path below
+        // (presumably the quotient did not fit the narrow type; confirm in `NarrowingDiv`).
+    }
+
+    // Assumes `x < 2y`
+    crate::support::linear_mul_reduction(x, e, y)
 }
diff --git a/library/compiler-builtins/libm/src/math/generic/scalbn.rs b/library/compiler-builtins/libm/src/math/generic/scalbn.rs
index 6dd9b1a9b84a4..68de41757913a 100644
--- a/library/compiler-builtins/libm/src/math/generic/scalbn.rs
+++ b/library/compiler-builtins/libm/src/math/generic/scalbn.rs
@@ -96,14 +96,14 @@ where
             // Work aroudn this by using a different algorithm that calculates the prescale
             // dynamically based on the maximum possible value. This adds more operations per round
             // since it needs to construct the scale, but works better in the general case.
-            let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32);
+            let add = -(n + sig_total_bits as i32).max(exp_min);
             let mul = F::from_parts(false, (F::EXP_BIAS as i32 - add) as u32, zero);
 
             x *= mul;
             n += add;
 
             if n < exp_min {
-                let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32);
+                let add = -(n + sig_total_bits as i32).max(exp_min);
                 let mul = F::from_parts(false, (F::EXP_BIAS as i32 - add) as u32, zero);
 
                 x *= mul;
diff --git a/library/compiler-builtins/libm/src/math/support/int_traits.rs b/library/compiler-builtins/libm/src/math/support/int_traits.rs
index f1aa1e5b9b4d2..55b609affd2e6 100644
--- a/library/compiler-builtins/libm/src/math/support/int_traits.rs
+++ b/library/compiler-builtins/libm/src/math/support/int_traits.rs
@@ -296,7 +296,14 @@ int_impl!(i128, u128);
 
 /// Trait for integers twice the bit width of another integer. This is implemented for all
 /// primitives except for `u8`, because there is not a smaller primitive.
-pub trait DInt: MinInt {
+pub trait DInt:
+    MinInt
+    + ops::Add<Output = Self>
+    + ops::Sub<Output = Self>
+    + ops::Shl<u32, Output = Self>
+    + ops::Shr<u32, Output = Self>
+    + Ord
+{
     /// Integer that is half the bit width of the integer this trait is implemented for
     type H: HInt<D = Self>;
 
diff --git a/library/compiler-builtins/libm/src/math/support/int_traits/narrowing_div.rs b/library/compiler-builtins/libm/src/math/support/int_traits/narrowing_div.rs
index 3da0843cc5408..e76fc5ae9f4ca 100644
--- a/library/compiler-builtins/libm/src/math/support/int_traits/narrowing_div.rs
+++ b/library/compiler-builtins/libm/src/math/support/int_traits/narrowing_div.rs
@@ -7,7 +7,6 @@ use crate::support::{CastInto, DInt, HInt, Int, MinInt, u256};
 /// This is the inverse of widening multiplication:
 ///  - for any `x` and nonzero `y`: `x.widen_mul(y).checked_narrowing_div_rem(y) == Some((x, 0))`,
 ///  - and for any `r in 0..y`: `x.carrying_mul(y, r).checked_narrowing_div_rem(y) == Some((x, r))`,
-#[allow(dead_code)]
 pub trait NarrowingDiv: DInt + MinInt<Unsigned = Self> {
     /// Computes `(self / n, self % n))`
     ///
diff --git a/library/compiler-builtins/libm/src/math/support/mod.rs b/library/compiler-builtins/libm/src/math/support/mod.rs
index 7b529eb760b73..15ab010dc8d5f 100644
--- a/library/compiler-builtins/libm/src/math/support/mod.rs
+++ b/library/compiler-builtins/libm/src/math/support/mod.rs
@@ -8,6 +8,7 @@ pub(crate) mod feature_detect;
 mod float_traits;
 pub mod hex_float;
 mod int_traits;
+mod modular;
 
 #[allow(unused_imports)]
 pub use big::{i256, u256};
@@ -28,8 +29,8 @@ pub use hex_float::hf16;
 pub use hex_float::hf128;
 #[allow(unused_imports)]
 pub use hex_float::{hf32, hf64};
-#[allow(unused_imports)]
 pub use int_traits::{CastFrom, CastInto, DInt, HInt, Int, MinInt, NarrowingDiv};
+pub use modular::linear_mul_reduction;
 
 /// Hint to the compiler that the current path is cold.
 pub fn cold_path() {
diff --git a/library/compiler-builtins/libm/src/math/support/modular.rs b/library/compiler-builtins/libm/src/math/support/modular.rs
new file mode 100644
index 0000000000000..cc0edf2f2bc04
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/support/modular.rs
@@ -0,0 +1,304 @@
+/* SPDX-License-Identifier: MIT OR Apache-2.0 */
+
+//! This module provides accelerated modular multiplication by large powers
+//! of two, which is needed for computing floating point remainders in `fmod`
+//! and similar functions.
+//!
+//! To keep the equations somewhat concise, the following conventions are used:
+//!  - all integer operations are in the mathematical sense, without overflow
+//!  - concatenation means multiplication: `2xq = 2 * x * q`
+//!  - `R = (1 << U::BITS)` is the modulus of wrapping arithmetic in `U`
+
+use crate::support::int_traits::NarrowingDiv;
+use crate::support::{DInt, HInt, Int};
+
+/// Compute the remainder `(x << e) % y`, evaluated as if with unbounded integers.
+/// Requires `x < 2y` and `y.leading_zeros() >= 2`; panics (via `assert!`) otherwise.
+pub fn linear_mul_reduction<U>(x: U, mut e: u32, mut y: U) -> U
+where
+    U: HInt + Int<Unsigned = U>,
+    U::D: NarrowingDiv,
+{
+    assert!(y <= U::MAX >> 2);
+    assert!(x < (y << 1));
+    let _0 = U::ZERO;
+    let _1 = U::ONE;
+
+    // power of two divisors: `y - 1` is then a mask of the bits below `y`
+    if (y & (y - _1)).is_zero() {
+        if e < U::BITS {
+            // bits lost by the shift lie above the mask, so masking suffices
+            return (x << e) & (y - _1);
+        } else {
+            // `x << e` is a multiple of `R`, hence also of `y`
+            return _0;
+        }
+    }
+
+    // Use the identity `(x << e) % y == ((x << (e + s)) % (y << s)) >> s`
+    // to shift the divisor so it has exactly two leading zeros to satisfy
+    // the precondition of `Reducer::new` (powers of two were handled above)
+    let s = y.leading_zeros() - 2;
+    e += s;
+    y <<= s;
+
+    // `m: Reducer` keeps track of the remainder `x` in a form that makes it
+    // very efficient to do `x <<= k` modulo `y` for integers `k < U::BITS`
+    let mut m = Reducer::new(x, y);
+
+    // Use the faster special case with constant `k == U::BITS - 1` while we can
+    while e >= U::BITS - 1 {
+        m.word_reduce();
+        e -= U::BITS - 1;
+    }
+    // Finish with the variable shift operation; here `e < U::BITS - 1`
+    m.shift_reduce(e);
+
+    // The partial remainder is in `[0, 2y)` ...
+    let r = m.partial_remainder();
+    // ... so check and correct, and compensate for the earlier shift.
+    r.checked_sub(y).unwrap_or(r) >> s
+}
+
+/// Helper type for computing the reductions. The implementation has a number
+/// of seemingly weird choices, but everything is aimed at streamlining
+/// `Reducer::word_reduce` into its current form.
+///
+/// Implicitly contains:
+///  - the modulus `n` in `(R/8, R/4)`
+///  - the partial remainder `x` in `[0, 2n)`
+/// The value of `n` is fixed for a given `Reducer`,
+/// but the value of `x` is modified by the methods.
+#[derive(Debug, Clone, PartialEq, Eq)]
+struct Reducer<U: HInt> {
+    // The doubled modulus `m = 2n`, which lies in `(R/4, R/2)`
+    m: U,
+    // q = (RR/2) / m
+    // r = (RR/2) % m
+    // Then RR/2 = qm + r, where `0 <= r < m`
+    // The value `q` is only needed during construction, so isn't saved.
+    r: U,
+    // The value `x` is implicitly stored as the double-width `2 * q * x`:
+    _2xq: U::D,
+}
+
+impl<U> Reducer<U>
+where
+    U: HInt,
+    U: Int<Unsigned = U>,
+{
+    /// Construct a reducer for `(x << _) mod n`.
+    ///
+    /// Requires `R/8 < n < R/4` and `x < 2n`; panics (via `assert!`) otherwise.
+    fn new(x: U, n: U) -> Self
+    where
+        U::D: NarrowingDiv,
+    {
+        let _1 = U::ONE;
+        assert!(n > (_1 << (U::BITS - 3)));
+        assert!(n < (_1 << (U::BITS - 2)));
+        let m = n << 1;
+        assert!(x < m);
+
+        // We need to compute the parameters
+        // `q = (RR/2) / m`
+        // `r = (RR/2) % m`
+
+        // Since `m` is in `(R/4, R/2)`, the quotient `q` is in `[R, 2R)`, and
+        // it would overflow in `U` if computed directly. Instead, we compute
+        // `f = q - R`, which is in `[0, R)`. To do so, we simply subtract `Rm`
+        // from the dividend, which doesn't change the remainder:
+        // `f = R(R/2 - m) / m`
+        // `r = R(R/2 - m) % m`
+        let dividend = ((_1 << (U::BITS - 1)) - m).widen_hi();
+        let (f, r) = dividend.checked_narrowing_div_rem(m).unwrap();
+
+        // As `x < m`, `xq < qm <= RR/2`
+        // Thus `2xq = 2xR + 2xf` does not overflow in `U::D`.
+        let _2x = x + x;
+        let _2xq = _2x.widen_hi() + _2x.widen_mul(f);
+        Self { m, r, _2xq }
+    }
+
+    /// Extract the current remainder `x` in the range `[0, 2n)` (`self` is not modified)
+    fn partial_remainder(&self) -> U {
+        // `RR/2 = qm + r`, where `0 <= r < m`
+        // `2xq = uR + v`,  where `0 <= v < R`
+
+        // The goal is to extract the current value of `x` from the value `2xq`
+        // that we actually have. A bit simplified, we could multiply it by `m`
+        // to obtain `2xqm == 2x(RR/2 - r) == xRR - 2xr`, where `2xr < RR`.
+        // We could just round that up to the next multiple of `RR` to get `x`,
+        // but we can avoid having to multiply the full double-wide `2xq` by
+        // making a couple of adjustments:
+
+        // First, let's only use the high half `u` for the product, and
+        // include an additional error term due to the truncation:
+        //  `mu = xR - (2xr + mv)/R`
+
+        // Next, show bounds for the error term
+        //  `0 <= mv < mR` follows from `0 <= v < R`
+        //  `0 <= 2xr < mR` follows from `0 <= x < m < R/2` and `0 <= r < m`
+        // Adding those together, we have:
+        //  `0 <= (mv + 2xr)/R < 2m`
+        // Which also implies:
+        //  `0 < 2m - (mv + 2xr)/R <= 2m < R`
+
+        // For that reason, we can use `u + 2` as the factor to obtain
+        //  `m(u + 2) = xR + (2m - (mv + 2xr)/R)`
+        // By the previous inequality, the second term fits neatly in the lower
+        // half, so we get exactly `x` as the high half.
+        let u = self._2xq.hi();
+        let _2 = U::ONE + U::ONE;
+        self.m.widen_mul(u + _2).hi()
+
+        // Additionally, we should ensure that `u + 2` cannot overflow:
+        // Since `x < m` and `2qm <= RR`,
+        //  `2xq <= 2q(m-1) <= RR - 2q`
+        // As we also have `q > R`,
+        //  `2xq < RR - 2R`
+        // which is sufficient.
+    }
+
+    /// Replace the remainder `x` with `(x << k) - un`,
+    /// for a suitable quotient `u` (called `a` in the body), which is returned.
+    ///
+    /// Requires that `k < U::BITS`; panics (via `assert!`) otherwise.
+    fn shift_reduce(&mut self, k: u32) -> U {
+        assert!(k < U::BITS);
+
+        // First, split the shifted value:
+        // `2xq << k = aRR/2 + b`, where `0 <= b < RR/2`
+        let a = self._2xq.hi() >> (U::BITS - 1 - k);
+        let (low, high) = (self._2xq << k).lo_hi();
+        let b = U::D::from_lo_hi(low, high & (U::MAX >> 1));
+
+        // Then, subtract `2anq = aqm`:
+        // ```
+        // (2xq << k) - aqm
+        // = aRR/2 + b - aqm
+        // = a(RR/2 - qm) + b
+        // = ar + b
+        // ```
+        self._2xq = a.widen_mul(self.r) + b;
+        a
+
+        // Since `a` is at most the high half of `2xq`, we have
+        //  `a + 2 < R` (shown above, in `partial_remainder`)
+        // Using that together with `b < RR/2` and `r < m < R/2`,
+        // we get `(a + 2)r + b < RR`, so
+        //  `ar + b < RR - 2r = 2mq`
+        // which shows that the new remainder still satisfies `x < m`.
+    }
+
+    // NB: `word_reduce()` is just the special case `shift_reduce(U::BITS - 1)`
+    // that optimizes especially well. The correspondence is that `a == u` and
+    //  `b == (v >> 1).widen_hi()`
+    //
+    /// Replace the remainder `x` with `x(R/2) - un`,
+    /// for a suitable quotient `u`, which is returned.
+    fn word_reduce(&mut self) -> U {
+        // To do so, we replace `2xq = uR + v` with
+        // ```
+        // 2 * (x(R/2) - un) * q
+        // = xqR - 2unq
+        // = xqR - uqm
+        // = uRR/2 + vR/2 - uRR/2 + ur
+        // = ur + (v/2)R
+        // ```
+        let (v, u) = self._2xq.lo_hi();
+        self._2xq = u.widen_mul(self.r) + U::widen_hi(v >> 1);
+        u
+
+        // Additional notes:
+        //  1. As `v` is the low bits of `2xq`, it is even and can be halved.
+        //  2. The new remainder is `(xr + mv/2) / R` (see below)
+        //      and since `v < R`, `r < m`, `x < m < R/2`,
+        //      that is also strictly less than `m`.
+        // ```
+        // (x(R/2) - un)R
+        //      = xRR/2 - (m/2)uR
+        //      = x(qm + r) - (m/2)(2xq - v)
+        //      = xqm + xr - xqm + mv/2
+        //      = xr + mv/2
+        // ```
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use crate::support::linear_mul_reduction;
+    use crate::support::modular::Reducer;
+
+    #[test]
+    fn reducer_ops() {
+        for n in 33..=63_u8 {
+            for x in 0..2 * n {
+                let temp = Reducer::new(x, n);
+                let n = n as u32;
+                let x0 = temp.partial_remainder() as u32;
+                assert_eq!(x as u32, x0);
+                for k in 0..=7 {
+                    let mut red = temp.clone();
+                    let u = red.shift_reduce(k) as u32;
+                    let x1 = red.partial_remainder() as u32;
+                    assert_eq!(x1, (x0 << k) - u * n);
+                    assert!(x1 < 2 * n);
+                    assert!((red._2xq as u32).is_multiple_of(2 * x1));
+
+                    // `word_reduce` must agree with `shift_reduce(U::BITS - 1)`,
+                    // both in the returned quotient and in the resulting state
+                    if k == 7 {
+                        let mut alt = temp.clone();
+                        let w = alt.word_reduce();
+                        assert_eq!(u, w as u32);
+                        assert_eq!(alt, red);
+                    }
+                }
+            }
+        }
+    }
+    #[test]
+    fn reduction_u8() {
+        for y in 1..64u8 {
+            for x in 0..2 * y {
+                let mut r = x % y;
+                for e in 0..100 {
+                    assert_eq!(r, linear_mul_reduction(x, e, y));
+                    // advance the expected remainder to `(2 * r) % y` for the next `e`
+                    r <<= 1;
+                    if r >= y {
+                        r -= y;
+                    }
+                }
+            }
+        }
+    }
+    #[test]
+    fn reduction_u128() {
+        assert_eq!(
+            linear_mul_reduction::<u128>(17, 100, 123456789),
+            (17 << 100) % 123456789
+        );
+
+        // power-of-two divisor with `e < U::BITS` (exercises the masking fast path)
+        assert_eq!(
+            linear_mul_reduction(0xdead_beef, 100, 1_u128 << 116),
+            0xbeef << 100
+        );
+
+        let x = 10_u128.pow(37);
+        let y = 11_u128.pow(36);
+        assert!(x < y);
+        let mut r = x;
+        for e in 0..1000 {
+            assert_eq!(r, linear_mul_reduction(x, e, y));
+            // advance the expected remainder to `(2 * r) % y` for the next `e`
+            r <<= 1;
+            if r >= y {
+                r -= y;
+            }
+            assert!(r != 0);
+        }
+    }
+}
diff --git a/library/compiler-builtins/rust-version b/library/compiler-builtins/rust-version
index 71fbbbaa984f6..7345c25066a82 100644
--- a/library/compiler-builtins/rust-version
+++ b/library/compiler-builtins/rust-version
@@ -1 +1 @@
-47cd7120d9b4e1b64eb27c87522a07888197fae8
+2dc30247c5d8293aaa31e1d7dae2ed2fde908ada
diff --git a/library/compiler-builtins/triagebot.toml b/library/compiler-builtins/triagebot.toml
index eba5cdd88b941..b210a5fb52563 100644
--- a/library/compiler-builtins/triagebot.toml
+++ b/library/compiler-builtins/triagebot.toml
@@ -12,10 +12,11 @@ exclude_titles = ["Rustc pull update"]
 [issue-links]
 check-commits = false
 
-# Prevents mentions in commits to avoid users being spammed
-# Documentation at: https://forge.rust-lang.org/triagebot/no-mentions.html
-[no-mentions]
-
 # Enable issue transfers within the org
 # Documentation at: https://forge.rust-lang.org/triagebot/transfer.html
 [transfer]
+
+# Enable comments linking to triagebot range-diff when a PR is rebased
+# onto a different base commit
+# Documentation at: https://forge.rust-lang.org/triagebot/range-diff.html
+[range-diff]