diff --git a/library/compiler-builtins/.github/workflows/main.yaml b/library/compiler-builtins/.github/workflows/main.yaml index c8faecfcb2cc7..38995cf0f0ff0 100644 --- a/library/compiler-builtins/.github/workflows/main.yaml +++ b/library/compiler-builtins/.github/workflows/main.yaml @@ -1,6 +1,6 @@ name: CI on: - push: { branches: [master] } + push: { branches: [main] } pull_request: concurrency: @@ -89,7 +89,7 @@ jobs: - target: x86_64-unknown-linux-gnu os: ubuntu-24.04 - target: x86_64-apple-darwin - os: macos-13 + os: macos-15-intel - target: i686-pc-windows-msvc os: windows-2025 - target: x86_64-pc-windows-msvc @@ -239,6 +239,8 @@ jobs: include: - target: x86_64-unknown-linux-gnu os: ubuntu-24.04 + - target: aarch64-unknown-linux-gnu + os: ubuntu-24.04-arm runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@master @@ -247,13 +249,13 @@ jobs: - name: Set up dependencies run: | sudo apt-get update - sudo apt-get install -y valgrind gdb libc6-dbg # Needed for iai-callgrind + sudo apt-get install -y valgrind gdb libc6-dbg # Needed for gungraun rustup update "$BENCHMARK_RUSTC" --no-self-update rustup default "$BENCHMARK_RUSTC" - # Install the version of iai-callgrind-runner that is specified in Cargo.toml - iai_version="$(cargo metadata --format-version=1 --features icount | - jq -r '.packages[] | select(.name == "iai-callgrind").version')" - cargo binstall -y iai-callgrind-runner --version "$iai_version" + # Install the version of gungraun-runner that is specified in Cargo.toml + gungraun_version="$(cargo metadata --format-version=1 --features icount | + jq -r '.packages[] | select(.name == "gungraun").version')" + cargo binstall -y gungraun-runner --version "$gungraun_version" sudo apt-get install valgrind - uses: Swatinem/rust-cache@v2 with: diff --git a/library/compiler-builtins/.github/workflows/publish.yaml b/library/compiler-builtins/.github/workflows/publish.yaml index 85a33c039d2a1..d6f1dc398e8ec 100644 --- a/library/compiler-builtins/.github/workflows/publish.yaml +++ b/library/compiler-builtins/.github/workflows/publish.yaml @@ -5,7 +5,7 @@ permissions: contents: write on: - push: { branches: [master] } + push: { branches: [main] } jobs: release-plz: diff --git a/library/compiler-builtins/.github/workflows/rustc-pull.yml b/library/compiler-builtins/.github/workflows/rustc-pull.yml index ad7693e17b0ee..617db14f46eea 100644 --- a/library/compiler-builtins/.github/workflows/rustc-pull.yml +++ b/library/compiler-builtins/.github/workflows/rustc-pull.yml @@ -17,7 +17,7 @@ jobs: zulip-stream-id: 219381 zulip-topic: 'compiler-builtins subtree sync automation' zulip-bot-email: "compiler-builtins-ci-bot@rust-lang.zulipchat.com" - pr-base-branch: master + pr-base-branch: main branch-name: rustc-pull secrets: zulip-api-token: ${{ secrets.ZULIP_API_TOKEN }} diff --git a/library/compiler-builtins/.gitignore b/library/compiler-builtins/.gitignore index f12b871c2f783..abe346659d4c7 100644 --- a/library/compiler-builtins/.gitignore +++ b/library/compiler-builtins/.gitignore @@ -9,6 +9,7 @@ compiler-rt # Benchmark cache baseline-* iai-home +gungraun-home # Temporary files *.bk diff --git a/library/compiler-builtins/CONTRIBUTING.md b/library/compiler-builtins/CONTRIBUTING.md index 9ae4f893c60d1..f74d3f8ba1276 100644 --- a/library/compiler-builtins/CONTRIBUTING.md +++ b/library/compiler-builtins/CONTRIBUTING.md @@ -150,8 +150,8 @@ cargo bench --no-default-features \ ``` There are also benchmarks that check instruction count behind the `icount` -feature. These require [`iai-callgrind-runner`] (via Cargo) and [Valgrind] -to be installed, which means these only run on limited platforms. +feature. These require [`gungraun-runner`] (via Cargo) and [Valgrind] to be +installed, which means these only run on limited platforms. Instruction count benchmarks are run as part of CI to flag performance regresions. @@ -163,7 +163,7 @@ cargo bench --no-default-features \ --bench icount --bench mem_icount ``` -[`iai-callgrind-runner`]: https://crates.io/crates/iai-callgrind-runner +[`gungraun-runner`]: https://crates.io/crates/gungraun-runner [Valgrind]: https://valgrind.org/ ## Subtree synchronization diff --git a/library/compiler-builtins/Cargo.toml b/library/compiler-builtins/Cargo.toml index 956d738f3b1f1..8501f4e630b55 100644 --- a/library/compiler-builtins/Cargo.toml +++ b/library/compiler-builtins/Cargo.toml @@ -51,5 +51,6 @@ codegen-units = 1 lto = "fat" [profile.bench] -# Required for iai-callgrind +# Required for gungraun debug = true +strip = false diff --git a/library/compiler-builtins/PUBLISHING.md b/library/compiler-builtins/PUBLISHING.md index 3df682ab04a4b..c521910641f55 100644 --- a/library/compiler-builtins/PUBLISHING.md +++ b/library/compiler-builtins/PUBLISHING.md @@ -5,7 +5,7 @@ It's not great, but it works for now. PRs to improve this process would be greatly appreciated! 1. Make sure you've got a clean working tree and it's updated with the latest - changes on `master` + changes on `main` 2. Edit `Cargo.toml` to bump the version number 3. Commit this change 4. Run `git tag` to create a tag for this version diff --git a/library/compiler-builtins/builtins-shim/Cargo.toml b/library/compiler-builtins/builtins-shim/Cargo.toml index 707ebdbc77b27..ac77224f5ce1e 100644 --- a/library/compiler-builtins/builtins-shim/Cargo.toml +++ b/library/compiler-builtins/builtins-shim/Cargo.toml @@ -11,7 +11,12 @@ [package] name = "compiler_builtins" version = "0.1.160" -authors = ["Jorge Aparicio "] +authors = [ + "Alex Crichton ", + "Amanieu d'Antras ", + "Jorge Aparicio ", + "Trevor Gross ", +] description = "Compiler intrinsics used by the Rust compiler." repository = "https://github.com/rust-lang/compiler-builtins" license = "MIT AND Apache-2.0 WITH LLVM-exception AND (MIT OR Apache-2.0)" diff --git a/library/compiler-builtins/builtins-test/Cargo.toml b/library/compiler-builtins/builtins-test/Cargo.toml index 00a9d8579d119..9346ea65420b2 100644 --- a/library/compiler-builtins/builtins-test/Cargo.toml +++ b/library/compiler-builtins/builtins-test/Cargo.toml @@ -1,7 +1,6 @@ [package] name = "builtins-test" version = "0.1.0" -authors = ["Alex Crichton "] edition = "2024" publish = false license = "MIT AND Apache-2.0 WITH LLVM-exception AND (MIT OR Apache-2.0)" @@ -14,7 +13,7 @@ rand_xoshiro = "0.7" # To compare float builtins against rustc_apfloat = "0.2.3" # Really a dev dependency, but dev dependencies can't be optional -iai-callgrind = { version = "0.15.2", optional = true } +gungraun = { version = "0.17.0", optional = true } [dependencies.compiler_builtins] path = "../builtins-shim" @@ -46,8 +45,8 @@ no-sys-f16-f64-convert = [] # Skip tests that rely on f16 symbols being available on the system no-sys-f16 = ["no-sys-f16-f64-convert"] -# Enable icount benchmarks (requires iai-callgrind and valgrind) -icount = ["dep:iai-callgrind"] +# Enable icount benchmarks (requires gungraun-runner and valgrind locally) +icount = ["dep:gungraun"] # Enable report generation without bringing in more dependencies by default benchmarking-reports = ["criterion/plotters", "criterion/html_reports"] diff --git a/library/compiler-builtins/builtins-test/benches/mem_icount.rs b/library/compiler-builtins/builtins-test/benches/mem_icount.rs index bd88cf80c7de2..37595e8258436 100644 --- a/library/compiler-builtins/builtins-test/benches/mem_icount.rs +++ b/library/compiler-builtins/builtins-test/benches/mem_icount.rs @@ -1,11 +1,11 @@ -//! Benchmarks that use Callgrind (via `iai_callgrind`) to report instruction count metrics. This +//! Benchmarks that use Callgrind (via `gungraun`) to report instruction count metrics. This //! is stable enough to be tested in CI. use std::hint::black_box; use std::{ops, slice}; use compiler_builtins::mem::{memcmp, memcpy, memmove, memset}; -use iai_callgrind::{library_benchmark, library_benchmark_group, main}; +use gungraun::{library_benchmark, library_benchmark_group, main}; const PAGE_SIZE: usize = 0x1000; // 4 kiB const MAX_ALIGN: usize = 512; // assume we may use avx512 operations one day @@ -108,7 +108,7 @@ mod mcpy { ], setup = setup, )] - fn bench((len, mut dst, src): (usize, AlignedSlice, AlignedSlice)) { + fn bench_cpy((len, mut dst, src): (usize, AlignedSlice, AlignedSlice)) { unsafe { black_box(memcpy( black_box(dst.as_mut_ptr()), @@ -118,7 +118,7 @@ mod mcpy { } } - library_benchmark_group!(name = memcpy; benchmarks = bench); + library_benchmark_group!(name = memcpy; benchmarks = bench_cpy); } mod mset { @@ -157,7 +157,7 @@ mod mset { ], setup = setup, )] - fn bench((len, mut dst): (usize, AlignedSlice)) { + fn bench_set((len, mut dst): (usize, AlignedSlice)) { unsafe { black_box(memset( black_box(dst.as_mut_ptr()), @@ -167,7 +167,7 @@ mod mset { } } - library_benchmark_group!(name = memset; benchmarks = bench); + library_benchmark_group!(name = memset; benchmarks = bench_set); } mod mcmp { @@ -225,7 +225,7 @@ mod mcmp { ], setup = setup )] - fn bench((len, mut dst, src): (usize, AlignedSlice, AlignedSlice)) { + fn bench_cmp((len, mut dst, src): (usize, AlignedSlice, AlignedSlice)) { unsafe { black_box(memcmp( black_box(dst.as_mut_ptr()), @@ -235,7 +235,7 @@ mod mcmp { } } - library_benchmark_group!(name = memcmp; benchmarks = bench); + library_benchmark_group!(name = memcmp; benchmarks = bench_cmp); } mod mmove { @@ -384,7 +384,7 @@ mod mmove { ], setup = setup_forward )] - fn forward((len, spread, mut buf): (usize, usize, AlignedSlice)) { + fn forward_move((len, spread, mut buf): (usize, usize, AlignedSlice)) { // Test moving from the start of the buffer toward the end unsafe { black_box(memmove( @@ -478,7 +478,7 @@ mod mmove { ], setup = setup_backward )] - fn backward((len, spread, mut buf): (usize, usize, AlignedSlice)) { + fn backward_move((len, spread, mut buf): (usize, usize, AlignedSlice)) { // Test moving from the end of the buffer toward the start unsafe { black_box(memmove( @@ -489,7 +489,7 @@ mod mmove { } } - library_benchmark_group!(name = memmove; benchmarks = forward, backward); + library_benchmark_group!(name = memmove; benchmarks = forward_move, backward_move); } use mcmp::memcmp; diff --git a/library/compiler-builtins/builtins-test/tests/lse.rs b/library/compiler-builtins/builtins-test/tests/lse.rs index 5d59fbb7f44d2..56891be8a8ac1 100644 --- a/library/compiler-builtins/builtins-test/tests/lse.rs +++ b/library/compiler-builtins/builtins-test/tests/lse.rs @@ -19,7 +19,11 @@ mod cas { let mut target = expected.wrapping_add(10); assert_eq!( unsafe { - compiler_builtins::aarch64_linux::$name::$name(expected, new, &mut target) + compiler_builtins::aarch64_outline_atomics::$name::$name( + expected, + new, + &mut target, + ) }, expected.wrapping_add(10), "return value should always be the previous value", @@ -33,7 +37,11 @@ mod cas { target = expected; assert_eq!( unsafe { - compiler_builtins::aarch64_linux::$name::$name(expected, new, &mut target) + compiler_builtins::aarch64_outline_atomics::$name::$name( + expected, + new, + &mut target, + ) }, expected ); @@ -54,7 +62,9 @@ mod swap { builtins_test::fuzz_2(10000, |left: super::int_ty!($bytes), mut right| { let orig_right = right; assert_eq!( - unsafe { compiler_builtins::aarch64_linux::$name::$name(left, &mut right) }, + unsafe { + compiler_builtins::aarch64_outline_atomics::$name::$name(left, &mut right) + }, orig_right ); assert_eq!(left, right); @@ -74,7 +84,7 @@ macro_rules! test_op { let mut target = old; let op: fn(super::int_ty!($bytes), super::int_ty!($bytes)) -> _ = $($op)*; let expected = op(old, val); - assert_eq!(old, unsafe { compiler_builtins::aarch64_linux::$name::$name(val, &mut target) }, "{} should return original value", stringify!($name)); + assert_eq!(old, unsafe { compiler_builtins::aarch64_outline_atomics::$name::$name(val, &mut target) }, "{} should return original value", stringify!($name)); assert_eq!(expected, target, "{} should store to target", stringify!($name)); }); } diff --git a/library/compiler-builtins/ci/bench-icount.sh b/library/compiler-builtins/ci/bench-icount.sh index 12228b9da971b..6d92b50a6dae7 100755 --- a/library/compiler-builtins/ci/bench-icount.sh +++ b/library/compiler-builtins/ci/bench-icount.sh @@ -10,35 +10,43 @@ if [ -z "$target" ]; then target="$host_target" fi -iai_home="iai-home" +# Print machine information +uname -a +lscpu || true + +gungraun_home="gungraun-home" # Use the arch as a tag to disambiguate artifacts tag="$(echo "$target" | cut -d'-' -f1)" -# Download the baseline from master +# Download the baseline from main ./ci/ci-util.py locate-baseline --download --extract --tag "$tag" +# FIXME: migration from iai-named baselines to gungraun, can be dropped +# after the first run with gungraun. +[ -d "iai-home" ] && mv "iai-home" "$gungraun_home" + # Run benchmarks once function run_icount_benchmarks() { cargo_args=( - "--bench" "icount" + "--bench" "*icount*" "--no-default-features" "--features" "unstable,unstable-float,icount" ) - iai_args=( - "--home" "$(pwd)/$iai_home" - "--callgrind-limits=ir=5.0" + gungraun_args=( + "--home" "$(pwd)/$gungraun_home" + "--callgrind-limits=ir=5.0%" "--save-summary" ) - # Parse `cargo_arg0 cargo_arg1 -- iai_arg0 iai_arg1` syntax - parsing_iai_args=0 + # Parse `cargo_arg0 cargo_arg1 -- gungraun_arg0 gungraun_arg1` syntax + parsing_gungraun_args=0 while [ "$#" -gt 0 ]; do - if [ "$parsing_iai_args" == "1" ]; then - iai_args+=("$1") + if [ "$parsing_gungraun_args" == "1" ]; then + gungraun_args+=("$1") elif [ "$1" == "--" ]; then - parsing_iai_args=1 + parsing_gungraun_args=1 else cargo_args+=("$1") fi @@ -46,9 +54,9 @@ function run_icount_benchmarks() { shift done - # Run iai-callgrind benchmarks. Do this in a subshell with `&& true` to - # capture rather than exit on error. - (cargo bench "${cargo_args[@]}" -- "${iai_args[@]}") && true + # Run gungraun benchmarks. Do this in a subshell with `&& true` to capture + # rather than exit on error. + (cargo bench "${cargo_args[@]}" -- "${gungraun_args[@]}") && true exit_code="$?" if [ "$exit_code" -eq 0 ]; then @@ -68,4 +76,4 @@ run_icount_benchmarks -- --save-baseline=hardfloat # Name and tar the new baseline name="baseline-icount-$tag-$(date -u +'%Y%m%d%H%M')-${GITHUB_SHA:0:12}" echo "BASELINE_NAME=$name" >>"$GITHUB_ENV" -tar cJf "$name.tar.xz" "$iai_home" +tar cJf "$name.tar.xz" "$gungraun_home" diff --git a/library/compiler-builtins/ci/ci-util.py b/library/compiler-builtins/ci/ci-util.py index c1db17c6c9010..ef9ce455178ec 100755 --- a/library/compiler-builtins/ci/ci-util.py +++ b/library/compiler-builtins/ci/ci-util.py @@ -38,7 +38,7 @@ `--tag` can be specified to look for artifacts with a specific tag, such as for a specific architecture. - Note that `--extract` will overwrite files in `iai-home`. + Note that `--extract` will overwrite files in `gungraun-home`. handle-bench-regressions PR_NUMBER Exit with success if the pull request contains a line starting with @@ -49,7 +49,7 @@ REPO_ROOT = Path(__file__).parent.parent GIT = ["git", "-C", REPO_ROOT] -DEFAULT_BRANCH = "master" +DEFAULT_BRANCH = "main" WORKFLOW_NAME = "CI" # Workflow that generates the benchmark artifacts ARTIFACT_PREFIX = "baseline-icount*" @@ -186,7 +186,7 @@ def __init__(self) -> None: def _init_change_list(self): """Create a list of files that have been changed. This uses GITHUB_REF if - available, otherwise a diff between `HEAD` and `master`. + available, otherwise a diff between `HEAD` and `main`. """ # For pull requests, GitHub creates a ref `refs/pull/1234/merge` (1234 being @@ -390,6 +390,7 @@ def locate_baseline(flags: list[str]) -> None: artifact_glob = f"{ARTIFACT_PREFIX}{f"-{tag}" if tag else ""}*" + # Skip checking because this will fail if the file already exists, which is fine. sp.run( ["gh", "run", "download", str(job_id), f"--pattern={artifact_glob}"], check=False, @@ -409,7 +410,17 @@ def locate_baseline(flags: list[str]) -> None: candidate_baselines.sort(reverse=True) baseline_archive = candidate_baselines[0] eprint(f"extracting {baseline_archive}") - sp.run(["tar", "xJvf", baseline_archive], check=True) + + all_paths = sp.check_output(["tar", "tJf", baseline_archive], encoding="utf8") + sp.run(["tar", "xJf", baseline_archive], check=True) + + # Print a short summary of paths, we don't use `tar v` since the list is huge + short_paths = re.findall(r"^(?:[^/\n]+/?){1,3}", all_paths, re.MULTILINE) + + print("Extracted:") + for path in sorted(set(short_paths)): + print(f"* {path}") + eprint("baseline extracted successfully") diff --git a/library/compiler-builtins/compiler-builtins/Cargo.toml b/library/compiler-builtins/compiler-builtins/Cargo.toml index 8bbe136ce33e3..0845861dcfe3c 100644 --- a/library/compiler-builtins/compiler-builtins/Cargo.toml +++ b/library/compiler-builtins/compiler-builtins/Cargo.toml @@ -7,7 +7,12 @@ [package] name = "compiler_builtins" version = "0.1.160" -authors = ["Jorge Aparicio "] +authors = [ + "Alex Crichton ", + "Amanieu d'Antras ", + "Jorge Aparicio ", + "Trevor Gross ", +] description = "Compiler intrinsics used by the Rust compiler." repository = "https://github.com/rust-lang/compiler-builtins" license = "MIT AND Apache-2.0 WITH LLVM-exception AND (MIT OR Apache-2.0)" diff --git a/library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/mod.rs b/library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/mod.rs index 7841e4f33cd66..5ffe1f59b4db6 100644 --- a/library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/mod.rs +++ b/library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/mod.rs @@ -196,13 +196,12 @@ unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) { unsafe { // divides the combined registers rdx:rax (`duo` is split into two 64 bit parts to do this) // by `div`. The quotient is stored in rax and the remainder in rdx. - // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. core::arch::asm!( "div {0}", in(reg) div, inlateout("rax") duo_lo => quo, inlateout("rdx") duo_hi => rem, - options(att_syntax, pure, nomem, nostack) + options(pure, nomem, nostack), ); } (quo, rem) @@ -283,13 +282,12 @@ unsafe fn u64_by_u32_div_rem(duo: u64, div: u32) -> (u32, u32) { unsafe { // divides the combined registers rdx:rax (`duo` is split into two 32 bit parts to do this) // by `div`. The quotient is stored in rax and the remainder in rdx. - // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. core::arch::asm!( "div {0}", in(reg) div, inlateout("rax") duo_lo => quo, inlateout("rdx") duo_hi => rem, - options(att_syntax, pure, nomem, nostack) + options(pure, nomem, nostack), ); } (quo, rem) diff --git a/library/compiler-builtins/compiler-builtins/src/mem/x86_64.rs b/library/compiler-builtins/compiler-builtins/src/mem/x86_64.rs index fb29eb11b231e..bf36a286ac951 100644 --- a/library/compiler-builtins/compiler-builtins/src/mem/x86_64.rs +++ b/library/compiler-builtins/compiler-builtins/src/mem/x86_64.rs @@ -22,13 +22,12 @@ use core::{intrinsics, mem}; #[inline(always)] #[cfg(target_feature = "ermsb")] pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) { - // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. - core::arch::asm!( - "repe movsb (%rsi), (%rdi)", + asm!( + "rep movsb [rdi], [rsi]", inout("rcx") count => _, inout("rdi") dest => _, inout("rsi") src => _, - options(att_syntax, nostack, preserves_flags) + options(nostack, preserves_flags) ); } @@ -42,21 +41,21 @@ pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) inout("ecx") pre_byte_count => _, inout("rdi") dest => dest, inout("rsi") src => src, - options(att_syntax, nostack, preserves_flags) + options(nostack, preserves_flags) ); asm!( "rep movsq", inout("rcx") qword_count => _, inout("rdi") dest => dest, inout("rsi") src => src, - options(att_syntax, nostack, preserves_flags) + options(nostack, preserves_flags) ); asm!( "rep movsb", inout("ecx") byte_count => _, inout("rdi") dest => _, inout("rsi") src => _, - options(att_syntax, nostack, preserves_flags) + options(nostack, preserves_flags) ); } @@ -67,14 +66,13 @@ pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) { asm!( "std", "rep movsb", - "sub $7, %rsi", - "sub $7, %rdi", - "mov {qword_count:r}, %rcx", + "sub rsi, 7", + "sub rdi, 7", + "mov rcx, {qword_count:r}", "rep movsq", - "test {pre_byte_count:e}, {pre_byte_count:e}", - "add $7, %rsi", - "add $7, %rdi", - "mov {pre_byte_count:e}, %ecx", + "add rsi, 7", + "add rdi, 7", + "mov ecx, {pre_byte_count:e}", "rep movsb", "cld", pre_byte_count = in(reg) pre_byte_count, @@ -82,21 +80,19 @@ pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) { inout("ecx") byte_count => _, inout("rdi") dest.add(count - 1) => _, inout("rsi") src.add(count - 1) => _, - // We modify flags, but we restore it afterwards - options(att_syntax, nostack, preserves_flags) + options(nostack) ); } #[inline(always)] #[cfg(target_feature = "ermsb")] pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) { - // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. - core::arch::asm!( - "repe stosb %al, (%rdi)", + asm!( + "rep stosb [rdi], al", inout("rcx") count => _, inout("rdi") dest => _, inout("al") c => _, - options(att_syntax, nostack, preserves_flags) + options(nostack, preserves_flags) ) } @@ -111,21 +107,21 @@ pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) { inout("ecx") pre_byte_count => _, inout("rdi") dest => dest, in("rax") c, - options(att_syntax, nostack, preserves_flags) + options(nostack, preserves_flags) ); asm!( "rep stosq", inout("rcx") qword_count => _, inout("rdi") dest => dest, in("rax") c, - options(att_syntax, nostack, preserves_flags) + options(nostack, preserves_flags) ); asm!( "rep stosb", inout("ecx") byte_count => _, inout("rdi") dest => _, in("rax") c, - options(att_syntax, nostack, preserves_flags) + options(nostack, preserves_flags) ); } @@ -212,10 +208,10 @@ pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize { let x = { let r; asm!( - "movdqa ({addr:r}), {dest}", + "movdqa {dest}, [{addr:r}]", addr = in(reg) s, dest = out(xmm_reg) r, - options(att_syntax, nostack), + options(nostack, preserves_flags), ); r }; @@ -232,10 +228,10 @@ pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize { let x = { let r; asm!( - "movdqa ({addr:r}), {dest}", + "movdqa {dest}, [{addr:r}]", addr = in(reg) s, dest = out(xmm_reg) r, - options(att_syntax, nostack), + options(nostack, preserves_flags), ); r }; @@ -277,10 +273,10 @@ pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize { let mut cs = { let r: u64; asm!( - "mov ({addr}), {dest}", + "mov {dest}, [{addr}]", addr = in(reg) s, dest = out(reg) r, - options(att_syntax, nostack), + options(nostack, preserves_flags), ); r }; diff --git a/library/compiler-builtins/compiler-builtins/src/probestack.rs b/library/compiler-builtins/compiler-builtins/src/probestack.rs index 72975485a7765..1cab64ea113c5 100644 --- a/library/compiler-builtins/compiler-builtins/src/probestack.rs +++ b/library/compiler-builtins/compiler-builtins/src/probestack.rs @@ -47,11 +47,11 @@ // We only define stack probing for these architectures today. #![cfg(any(target_arch = "x86_64", target_arch = "x86"))] -// Our goal here is to touch each page between %rsp+8 and %rsp+8-%rax, +// Our goal here is to touch each page between `rsp+8` and `rsp+8-rax`, // ensuring that if any pages are unmapped we'll make a page fault. // -// The ABI here is that the stack frame size is located in `%rax`. Upon -// return we're not supposed to modify `%rsp` or `%rax`. +// The ABI here is that the stack frame size is located in `rax`. Upon +// return we're not supposed to modify `rsp` or `rax`. #[cfg(target_arch = "x86_64")] #[unsafe(naked)] #[rustc_std_internal_symbol] @@ -59,50 +59,50 @@ pub unsafe extern "custom" fn __rust_probestack() { core::arch::naked_asm!( " .cfi_startproc - pushq %rbp + push rbp .cfi_adjust_cfa_offset 8 - .cfi_offset %rbp, -16 - movq %rsp, %rbp - .cfi_def_cfa_register %rbp + .cfi_offset rbp, -16 + mov rbp, rsp + .cfi_def_cfa_register rbp - mov %rax,%r11 // duplicate %rax as we're clobbering %r11 + mov r11, rax // duplicate rax as we're clobbering r11 // Main loop, taken in one page increments. We're decrementing rsp by // a page each time until there's less than a page remaining. We're // guaranteed that this function isn't called unless there's more than a // page needed. // - // Note that we're also testing against `8(%rsp)` to account for the 8 + // Note that we're also testing against `[rsp + 8]` to account for the 8 // bytes pushed on the stack originally with our return address. Using - // `8(%rsp)` simulates us testing the stack pointer in the caller's + // `[rsp + 8]` simulates us testing the stack pointer in the caller's // context. - // It's usually called when %rax >= 0x1000, but that's not always true. + // It's usually called when rax >= 0x1000, but that's not always true. // Dynamic stack allocation, which is needed to implement unsized - // rvalues, triggers stackprobe even if %rax < 0x1000. - // Thus we have to check %r11 first to avoid segfault. - cmp $0x1000,%r11 + // rvalues, triggers stackprobe even if rax < 0x1000. + // Thus we have to check r11 first to avoid segfault. + cmp r11, 0x1000 jna 3f 2: - sub $0x1000,%rsp - test %rsp,8(%rsp) - sub $0x1000,%r11 - cmp $0x1000,%r11 + sub rsp, 0x1000 + test qword ptr [rsp + 8], rsp + sub r11, 0x1000 + cmp r11, 0x1000 ja 2b 3: // Finish up the last remaining stack space requested, getting the last // bits out of r11 - sub %r11,%rsp - test %rsp,8(%rsp) + sub rsp, r11 + test qword ptr [rsp + 8], rsp // Restore the stack pointer to what it previously was when entering // this function. The caller will readjust the stack pointer after we // return. - add %rax,%rsp + add rsp, rax leave - .cfi_def_cfa_register %rsp + .cfi_def_cfa_register rsp .cfi_adjust_cfa_offset -8 ", #[cfg(not(all(target_env = "sgx", target_vendor = "fortanix")))] @@ -112,14 +112,13 @@ pub unsafe extern "custom" fn __rust_probestack() { // for this target, [manually patch for LVI]. // // [manually patch for LVI]: https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions - pop %r11 + pop r11 lfence - jmp *%r11 + jmp r11 ", " .cfi_endproc ", - options(att_syntax) ) } @@ -135,36 +134,35 @@ pub unsafe extern "custom" fn __rust_probestack() { core::arch::naked_asm!( " .cfi_startproc - push %ebp + push ebp .cfi_adjust_cfa_offset 4 - .cfi_offset %ebp, -8 - mov %esp, %ebp - .cfi_def_cfa_register %ebp - push %ecx - mov %eax,%ecx + .cfi_offset ebp, -8 + mov ebp, esp + .cfi_def_cfa_register ebp + push ecx + mov ecx, eax - cmp $0x1000,%ecx + cmp ecx, 0x1000 jna 3f 2: - sub $0x1000,%esp - test %esp,8(%esp) - sub $0x1000,%ecx - cmp $0x1000,%ecx + sub esp, 0x1000 + test dword ptr [esp + 8], esp + sub ecx, 0x1000 + cmp ecx, 0x1000 ja 2b 3: - sub %ecx,%esp - test %esp,8(%esp) + sub esp, ecx + test dword ptr [esp + 8], esp - add %eax,%esp - pop %ecx + add esp, eax + pop ecx leave - .cfi_def_cfa_register %esp + .cfi_def_cfa_register esp .cfi_adjust_cfa_offset -4 ret .cfi_endproc - ", - options(att_syntax) + ", ) } @@ -176,8 +174,8 @@ pub unsafe extern "custom" fn __rust_probestack() { // REF: Rust commit(74e80468347) // rust\src\llvm-project\llvm\lib\Target\X86\X86FrameLowering.cpp: 805 // Comments in LLVM: -// MSVC x32's _chkstk and cygwin/mingw's _alloca adjust %esp themselves. -// MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp +// MSVC x32's _chkstk and cygwin/mingw's _alloca adjust esp themselves. +// MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust `rsp` // themselves. #[unsafe(naked)] #[rustc_std_internal_symbol] @@ -185,40 +183,39 @@ pub unsafe extern "custom" fn __rust_probestack() { core::arch::naked_asm!( " .cfi_startproc - push %ebp + push ebp .cfi_adjust_cfa_offset 4 - .cfi_offset %ebp, -8 - mov %esp, %ebp - .cfi_def_cfa_register %ebp - push %ecx - push %edx - mov %eax,%ecx - - cmp $0x1000,%ecx + .cfi_offset ebp, -8 + mov ebp, esp + .cfi_def_cfa_register ebp + push ecx + push edx + mov ecx, eax + + cmp ecx, 0x1000 jna 3f 2: - sub $0x1000,%esp - test %esp,8(%esp) - sub $0x1000,%ecx - cmp $0x1000,%ecx + sub esp, 0x1000 + test dword ptr [esp + 8], esp + sub ecx, 0x1000 + cmp ecx, 0x1000 ja 2b 3: - sub %ecx,%esp - test %esp,8(%esp) - mov 4(%ebp),%edx - mov %edx, 12(%esp) - add %eax,%esp - pop %edx - pop %ecx + sub esp, ecx + test dword ptr [esp + 8], esp + mov edx, dword ptr [ebp + 4] + mov dword ptr [esp + 12], edx + add esp, eax + pop edx + pop ecx leave - sub %eax, %esp - .cfi_def_cfa_register %esp + sub esp, eax + .cfi_def_cfa_register esp .cfi_adjust_cfa_offset -4 ret .cfi_endproc - ", - options(att_syntax) + ", ) } diff --git a/library/compiler-builtins/compiler-builtins/src/x86.rs b/library/compiler-builtins/compiler-builtins/src/x86.rs index 51940b3b338a2..1a3c418609451 100644 --- a/library/compiler-builtins/compiler-builtins/src/x86.rs +++ b/library/compiler-builtins/compiler-builtins/src/x86.rs @@ -22,26 +22,25 @@ intrinsics! { pub unsafe extern "custom" fn _alloca() { // __chkstk and _alloca are the same function core::arch::naked_asm!( - "push %ecx", - "cmp $0x1000,%eax", - "lea 8(%esp),%ecx", // esp before calling this routine -> ecx - "jb 1f", + "push ecx", + "cmp eax, 0x1000", + "lea ecx, [esp + 8]", // esp before calling this routine -> ecx + "jb 3f", "2:", - "sub $0x1000,%ecx", - "test %ecx,(%ecx)", - "sub $0x1000,%eax", - "cmp $0x1000,%eax", + "sub ecx, 0x1000", + "test [ecx], ecx", + "sub eax, 0x1000", + "cmp eax, 0x1000", "ja 2b", - "1:", - "sub %eax,%ecx", - "test %ecx,(%ecx)", - "lea 4(%esp),%eax", // load pointer to the return address into eax - "mov %ecx,%esp", // install the new top of stack pointer into esp - "mov -4(%eax),%ecx", // restore ecx - "push (%eax)", // push return address onto the stack - "sub %esp,%eax", // restore the original value in eax + "3:", + "sub ecx, eax", + "test [ecx], ecx", + "lea eax, [esp + 4]", // load pointer to the return address into eax + "mov esp, ecx", // install the new top of stack pointer into esp + "mov ecx, [eax - 4]", // restore ecx + "push [eax]", // push return address onto the stack + "sub eax, esp", // restore the original value in eax "ret", - options(att_syntax) ); } } diff --git a/library/compiler-builtins/compiler-builtins/src/x86_64.rs b/library/compiler-builtins/compiler-builtins/src/x86_64.rs index f9ae784d57520..99a527ee9ac5e 100644 --- a/library/compiler-builtins/compiler-builtins/src/x86_64.rs +++ b/library/compiler-builtins/compiler-builtins/src/x86_64.rs @@ -12,24 +12,23 @@ intrinsics! { #[cfg(any(all(windows, target_env = "gnu"), target_os = "cygwin", target_os = "uefi"))] pub unsafe extern "custom" fn ___chkstk_ms() { core::arch::naked_asm!( - "push %rcx", - "push %rax", - "cmp $0x1000,%rax", - "lea 24(%rsp),%rcx", - "jb 1f", + "push rcx", + "push rax", + "cmp rax, 0x1000", + "lea rcx, [rsp + 24]", + "jb 3f", "2:", - "sub $0x1000,%rcx", - "test %rcx,(%rcx)", - "sub $0x1000,%rax", - "cmp $0x1000,%rax", + "sub rcx, 0x1000", + "test [rcx], rcx", + "sub rax, 0x1000", + "cmp rax, 0x1000", "ja 2b", - "1:", - "sub %rax,%rcx", - "test %rcx,(%rcx)", - "pop %rax", - "pop %rcx", + "3:", + "sub rcx, rax", + "test [rcx], rcx", + "pop rax", + "pop rcx", "ret", - options(att_syntax) ); } } diff --git a/library/compiler-builtins/crates/panic-handler/Cargo.toml b/library/compiler-builtins/crates/panic-handler/Cargo.toml index a6764fc481b64..70898368d8e73 100644 --- a/library/compiler-builtins/crates/panic-handler/Cargo.toml +++ b/library/compiler-builtins/crates/panic-handler/Cargo.toml @@ -1,7 +1,6 @@ [package] name = "panic-handler" version = "0.1.0" -authors = ["Alex Crichton "] edition = "2024" publish = false diff --git a/library/compiler-builtins/libm-test/Cargo.toml b/library/compiler-builtins/libm-test/Cargo.toml index 0af6b0c1da5ca..adecfc1af6b87 100644 --- a/library/compiler-builtins/libm-test/Cargo.toml +++ b/library/compiler-builtins/libm-test/Cargo.toml @@ -21,8 +21,8 @@ build-musl = ["dep:musl-math-sys"] # Enable report generation without bringing in more dependencies by default benchmarking-reports = ["criterion/plotters", "criterion/html_reports"] -# Enable icount benchmarks (requires iai-callgrind and valgrind) -icount = ["dep:iai-callgrind"] +# Enable icount benchmarks (requires gungraun-runner and valgrind locally) +icount = ["dep:gungraun"] # Run with a reduced set of benchmarks, such as for CI short-benchmarks = [] @@ -31,7 +31,7 @@ short-benchmarks = [] anyhow = "1.0.98" # This is not directly used but is required so we can enable `gmp-mpfr-sys/force-cross`. gmp-mpfr-sys = { version = "1.6.5", optional = true, default-features = false } -iai-callgrind = { version = "0.15.2", optional = true } +gungraun = { version = "0.17.0", optional = true } indicatif = { version = "0.18.0", default-features = false } libm = { path = "../libm", features = ["unstable-public-internals"] } libm-macros = { path = "../crates/libm-macros" } diff --git a/library/compiler-builtins/libm-test/benches/icount.rs b/library/compiler-builtins/libm-test/benches/icount.rs index 0b85771225dde..fb856d9be4517 100644 --- a/library/compiler-builtins/libm-test/benches/icount.rs +++ b/library/compiler-builtins/libm-test/benches/icount.rs @@ -1,10 +1,10 @@ -//! Benchmarks that use `iai-cachegrind` to be reasonably CI-stable. +//! Benchmarks that use `gungraun` to be reasonably CI-stable. #![feature(f16)] #![feature(f128)] use std::hint::black_box; -use iai_callgrind::{library_benchmark, library_benchmark_group, main}; +use gungraun::{library_benchmark, library_benchmark_group, main}; use libm::support::{HInt, Hexf, hf16, hf32, hf64, hf128, u256}; use libm_test::generate::spaced; use libm_test::{CheckBasis, CheckCtx, GeneratorKind, MathOp, OpRustArgs, TupleCall, op}; @@ -156,7 +156,13 @@ fn icount_bench_u256_shr(cases: Vec<(u256, u32)>) { library_benchmark_group!( name = icount_bench_u128_group; - benchmarks = icount_bench_u128_widen_mul, icount_bench_u256_narrowing_div, icount_bench_u256_add, icount_bench_u256_sub, icount_bench_u256_shl, icount_bench_u256_shr + benchmarks = + icount_bench_u128_widen_mul, + icount_bench_u256_narrowing_div, + icount_bench_u256_add, + icount_bench_u256_sub, + icount_bench_u256_shl, + icount_bench_u256_shr ); #[library_benchmark] diff --git a/library/compiler-builtins/libm-test/src/precision.rs b/library/compiler-builtins/libm-test/src/precision.rs index c441922d302b9..7887c032394b8 100644 --- a/library/compiler-builtins/libm-test/src/precision.rs +++ b/library/compiler-builtins/libm-test/src/precision.rs @@ -83,6 +83,19 @@ pub fn default_ulp(ctx: &CheckCtx) -> u32 { Bn::Tgamma => 20, }; + // These have a separate implementation on i586 + if cfg!(x86_no_sse) { + match ctx.fn_ident { + Id::Exp => ulp = 1, + Id::Exp2 => ulp = 1, + Id::Exp10 => ulp = 1, + Id::Expf => ulp = 0, + Id::Exp2f => ulp = 0, + Id::Exp10f => ulp = 0, + _ => (), + } + } + // There are some cases where musl's approximation is less accurate than ours. For these // cases, increase the ULP. if ctx.basis == Musl { @@ -98,6 +111,8 @@ pub fn default_ulp(ctx: &CheckCtx) -> u32 { Id::Cbrt => ulp = 2, // FIXME(#401): musl has an incorrect result here. Id::Fdim => ulp = 2, + Id::Exp2f => ulp = 1, + Id::Expf => ulp = 1, Id::Sincosf => ulp = 500, Id::Tgamma => ulp = 20, _ => (), @@ -124,8 +139,6 @@ pub fn default_ulp(ctx: &CheckCtx) -> u32 { Id::Asinh => ulp = 3, Id::Asinhf => ulp = 3, Id::Cbrt => ulp = 1, - Id::Exp10 | Id::Exp10f => ulp = 1_000_000, - Id::Exp2 | Id::Exp2f => ulp = 10_000_000, Id::Log1p | Id::Log1pf => ulp = 2, Id::Tan => ulp = 2, _ => (), @@ -205,36 +218,6 @@ impl MaybeOverride<(f16,)> for SpecialCase {} impl MaybeOverride<(f32,)> for SpecialCase { fn check_float(input: (f32,), actual: F, expected: F, ctx: &CheckCtx) -> CheckAction { - if ctx.base_name == BaseName::Expm1 - && !input.0.is_infinite() - && input.0 > 80.0 - && actual.is_infinite() - && !expected.is_infinite() - { - // we return infinity but the number is representable - if ctx.basis == CheckBasis::Musl { - return XFAIL_NOCHECK; - } - return XFAIL("expm1 representable numbers"); - } - - if cfg!(x86_no_sse) - && ctx.base_name == BaseName::Exp2 - && !expected.is_infinite() - && actual.is_infinite() - { - // We return infinity when there is a representable value. Test input: 127.97238 - return XFAIL("586 exp2 representable numbers"); - } - - if ctx.base_name == BaseName::Sinh && input.0.abs() > 80.0 && actual.is_nan() { - // we return some NaN that should be real values or infinite - if ctx.basis == CheckBasis::Musl { - return XFAIL_NOCHECK; - } - return XFAIL("sinh unexpected NaN"); - } - if (ctx.base_name == BaseName::Lgamma || ctx.base_name == BaseName::LgammaR) && input.0 > 4e36 && expected.is_infinite() @@ -278,14 +261,6 @@ impl MaybeOverride<(f64,)> for SpecialCase { return XFAIL("i586 rint rounding mode"); } - if cfg!(x86_no_sse) - && (ctx.fn_ident == Identifier::Exp10 || ctx.fn_ident == Identifier::Exp2) - { - // FIXME: i586 has very imprecise results with ULP > u32::MAX for these - // operations so we can't reasonably provide a limit. - return XFAIL_NOCHECK; - } - if ctx.base_name == BaseName::J0 && input.0 < -1e300 { // Errors get huge close to -inf return XFAIL_NOCHECK; diff --git a/library/compiler-builtins/libm/Cargo.toml b/library/compiler-builtins/libm/Cargo.toml index 63b4d3c277989..5b5ca34fd2c9e 100644 --- a/library/compiler-builtins/libm/Cargo.toml +++ b/library/compiler-builtins/libm/Cargo.toml @@ -1,7 +1,12 @@ [package] name = "libm" version = "0.2.15" -authors = ["Jorge Aparicio "] +authors = [ + "Alex Crichton ", + "Amanieu d'Antras ", + "Jorge Aparicio ", + "Trevor Gross ", +] description = "libm in pure Rust" categories = ["no-std"] keywords = ["libm", "math"] diff --git a/library/compiler-builtins/libm/src/math/arch/i586.rs b/library/compiler-builtins/libm/src/math/arch/i586.rs index b9a66762063db..d9bb93fbf5852 100644 --- a/library/compiler-builtins/libm/src/math/arch/i586.rs +++ b/library/compiler-builtins/libm/src/math/arch/i586.rs @@ -60,3 +60,62 @@ pub fn floor(mut x: f64) -> f64 { } x } +/// Implements the exponential functions with `x87` assembly. +/// +/// This relies on the instruction `f2xm1`, which computes `2^x - 1` (for +/// |x| < 1). This transcendental instruction is documented to produce results +/// with error below 1ulp (in the native double-extended precision format). This +/// translates to correctly rounded results for f32, but results in f64 may have +/// 1ulp error, which may depend on the hardware. +macro_rules! x87exp { + ($float_ty:ident, $word_size:literal, $fn_name:ident, $load_op:literal) => { + pub fn $fn_name(mut x: $float_ty) -> $float_ty { unsafe { + core::arch::asm!( + // Prepare the register stack as + // ``` + // st(0) = y = x*log2(base) + // st(1) = 1.0 + // st(2) = round(y) + // ``` + concat!($load_op, " ", $word_size, " ptr [{x}]"), + "fld1", + "fld st(1)", + "frndint", + "fxch st(2)", + + // Compare y with round(y) to determine if y is finite and + // not an integer. If so, compute `exp2(y - round(y))` into + // st(1). Otherwise skip ahead with `st(1) = 1.0` + "fucom st(2)", + "fstsw ax", + "test ax, 0x4000", + "jnz 2f", + "fsub st(0), st(2)", // st(0) = y - round(y) + "f2xm1", // st(0) = 2^st(0) - 1.0 + "fadd st(1), st(0)", // st(1) = 1 + st(0) = exp2(y - round(y)) + "2:", + + // Finally, scale by `exp2(round(y))` and clear the stack. + "fstp st(0)", + "fscale", + concat!("fstp ", $word_size, " ptr [{x}]"), + "fstp st(0)", + x = in(reg) &mut x, + out("ax") _, + out("st(0)") _, out("st(1)") _, + out("st(2)") _, out("st(3)") _, + out("st(4)") _, out("st(5)") _, + out("st(6)") _, out("st(7)") _, + options(nostack), + ); + x + }} + }; +} + +x87exp!(f32, "dword", x87_exp2f, "fld"); +x87exp!(f64, "qword", x87_exp2, "fld"); +x87exp!(f32, "dword", x87_exp10f, "fldl2t\nfmul"); +x87exp!(f64, "qword", x87_exp10, "fldl2t\nfmul"); +x87exp!(f32, "dword", x87_expf, "fldl2e\nfmul"); +x87exp!(f64, "qword", x87_exp, "fldl2e\nfmul"); diff --git a/library/compiler-builtins/libm/src/math/arch/mod.rs b/library/compiler-builtins/libm/src/math/arch/mod.rs index 984ae7f3129f9..ba859c679d0db 100644 --- a/library/compiler-builtins/libm/src/math/arch/mod.rs +++ b/library/compiler-builtins/libm/src/math/arch/mod.rs @@ -48,3 +48,8 @@ cfg_if! { pub use i586::{ceil, floor}; } } +cfg_if! { + if #[cfg(x86_no_sse)] { + pub use i586::{x87_exp10f, x87_exp10, x87_expf, x87_exp, x87_exp2f, x87_exp2}; + } +} diff --git a/library/compiler-builtins/libm/src/math/exp.rs b/library/compiler-builtins/libm/src/math/exp.rs index 78ce5dd134ac3..cb939ad5d8bf2 100644 --- a/library/compiler-builtins/libm/src/math/exp.rs +++ b/library/compiler-builtins/libm/src/math/exp.rs @@ -83,6 +83,12 @@ const P5: f64 = 4.13813679705723846039e-08; /* 0x3E663769, 0x72BEA4D0 */ /// (where *e* is the base of the natural system of logarithms, approximately 2.71828). #[cfg_attr(assert_no_panic, no_panic::no_panic)] pub fn exp(mut x: f64) -> f64 { + select_implementation! { + name: x87_exp, + use_arch_required: x86_no_sse, + args: x, + } + let x1p1023 = f64::from_bits(0x7fe0000000000000); // 0x1p1023 === 2 ^ 1023 let x1p_149 = f64::from_bits(0x36a0000000000000); // 0x1p-149 === 2 ^ -149 diff --git a/library/compiler-builtins/libm/src/math/exp10.rs b/library/compiler-builtins/libm/src/math/exp10.rs index 1f49f5e96979c..e0af1945b922a 100644 --- a/library/compiler-builtins/libm/src/math/exp10.rs +++ b/library/compiler-builtins/libm/src/math/exp10.rs @@ -9,6 +9,12 @@ const P10: &[f64] = &[ /// Calculates 10 raised to the power of `x` (f64). #[cfg_attr(assert_no_panic, no_panic::no_panic)] pub fn exp10(x: f64) -> f64 { + select_implementation! { + name: x87_exp10, + use_arch_required: x86_no_sse, + args: x, + } + let (mut y, n) = modf(x); let u: u64 = n.to_bits(); /* fabs(n) < 16 without raising invalid on nan */ diff --git a/library/compiler-builtins/libm/src/math/exp10f.rs b/library/compiler-builtins/libm/src/math/exp10f.rs index 22a264211d03e..f0a311c2d1915 100644 --- a/library/compiler-builtins/libm/src/math/exp10f.rs +++ b/library/compiler-builtins/libm/src/math/exp10f.rs @@ -9,6 +9,12 @@ const P10: &[f32] = &[ /// Calculates 10 raised to the power of `x` (f32). #[cfg_attr(assert_no_panic, no_panic::no_panic)] pub fn exp10f(x: f32) -> f32 { + select_implementation! { + name: x87_exp10f, + use_arch_required: x86_no_sse, + args: x, + } + let (mut y, n) = modff(x); let u = n.to_bits(); /* fabsf(n) < 8 without raising invalid on nan */ diff --git a/library/compiler-builtins/libm/src/math/exp2.rs b/library/compiler-builtins/libm/src/math/exp2.rs index 6e4cbc29dcc99..08b71587f6de5 100644 --- a/library/compiler-builtins/libm/src/math/exp2.rs +++ b/library/compiler-builtins/libm/src/math/exp2.rs @@ -324,6 +324,12 @@ static TBL: [u64; TBLSIZE * 2] = [ /// Calculate `2^x`, that is, 2 raised to the power `x`. #[cfg_attr(assert_no_panic, no_panic::no_panic)] pub fn exp2(mut x: f64) -> f64 { + select_implementation! { + name: x87_exp2, + use_arch_required: x86_no_sse, + args: x, + } + let redux = f64::from_bits(0x4338000000000000) / TBLSIZE as f64; let p1 = f64::from_bits(0x3fe62e42fefa39ef); let p2 = f64::from_bits(0x3fcebfbdff82c575); diff --git a/library/compiler-builtins/libm/src/math/exp2f.rs b/library/compiler-builtins/libm/src/math/exp2f.rs index 733d2f1a84738..ceff6822c5969 100644 --- a/library/compiler-builtins/libm/src/math/exp2f.rs +++ b/library/compiler-builtins/libm/src/math/exp2f.rs @@ -75,6 +75,12 @@ static EXP2FT: [u64; TBLSIZE] = [ /// Calculate `2^x`, that is, 2 raised to the power `x`. #[cfg_attr(assert_no_panic, no_panic::no_panic)] pub fn exp2f(mut x: f32) -> f32 { + select_implementation! { + name: x87_exp2f, + use_arch_required: x86_no_sse, + args: x, + } + let redux = f32::from_bits(0x4b400000) / TBLSIZE as f32; let p1 = f32::from_bits(0x3f317218); let p2 = f32::from_bits(0x3e75fdf0); diff --git a/library/compiler-builtins/libm/src/math/expf.rs b/library/compiler-builtins/libm/src/math/expf.rs index dbbfdbba9253b..5541ab79a9c14 100644 --- a/library/compiler-builtins/libm/src/math/expf.rs +++ b/library/compiler-builtins/libm/src/math/expf.rs @@ -32,6 +32,12 @@ const P2: f32 = -2.7667332906e-3; /* -0xb55215.0p-32 */ /// (where *e* is the base of the natural system of logarithms, approximately 2.71828). #[cfg_attr(assert_no_panic, no_panic::no_panic)] pub fn expf(mut x: f32) -> f32 { + select_implementation! { + name: x87_expf, + use_arch_required: x86_no_sse, + args: x, + } + let x1p127 = f32::from_bits(0x7f000000); // 0x1p127f === 2 ^ 127 let x1p_126 = f32::from_bits(0x800000); // 0x1p-126f === 2 ^ -126 /*original 0x1p-149f ??????????? */ let mut hx = x.to_bits(); diff --git a/library/compiler-builtins/libm/src/math/expm1f.rs b/library/compiler-builtins/libm/src/math/expm1f.rs index f77515a4b99b3..388da3f30173d 100644 --- a/library/compiler-builtins/libm/src/math/expm1f.rs +++ b/library/compiler-builtins/libm/src/math/expm1f.rs @@ -13,7 +13,6 @@ * ==================================================== */ -const O_THRESHOLD: f32 = 8.8721679688e+01; /* 0x42b17180 */ const LN2_HI: f32 = 6.9313812256e-01; /* 0x3f317180 */ const LN2_LO: f32 = 9.0580006145e-06; /* 0x3717f7d1 */ const INV_LN2: f32 = 1.4426950216e+00; /* 0x3fb8aa3b */ @@ -50,7 +49,8 @@ pub fn expm1f(mut x: f32) -> f32 { if sign { return -1.; } - if x > O_THRESHOLD { + if hx > 0x42b17217 { + /* x > log(FLT_MAX) */ x *= x1p127; return x; } diff --git a/library/compiler-builtins/libm/src/math/generic/fmod.rs b/library/compiler-builtins/libm/src/math/generic/fmod.rs index 29acc8a4d5df5..3c3fd44b27cc2 100644 --- a/library/compiler-builtins/libm/src/math/generic/fmod.rs +++ b/library/compiler-builtins/libm/src/math/generic/fmod.rs @@ -1,8 +1,12 @@ /* SPDX-License-Identifier: MIT OR Apache-2.0 */ -use crate::support::{CastFrom, Float, Int, MinInt}; +use crate::support::{CastFrom, CastInto, Float, HInt, Int, MinInt, NarrowingDiv}; #[inline] -pub fn fmod(x: F, y: F) -> F { +pub fn fmod(x: F, y: F) -> F +where + F::Int: HInt, + ::D: NarrowingDiv, +{ let _1 = F::Int::ONE; let sx = x.to_bits() & F::SIGN_MASK; let ux = x.to_bits() & !F::SIGN_MASK; @@ -29,7 +33,7 @@ pub fn fmod(x: F, y: F) -> F { // To compute `(num << ex) % (div << ey)`, first // evaluate `rem = (num << (ex - ey)) % div` ... - let rem = reduction(num, ex - ey, div); + let rem = reduction::(num, ex - ey, div); // ... so the result will be `rem << ey` if rem.is_zero() { @@ -58,11 +62,55 @@ fn into_sig_exp(mut bits: F::Int) -> (F::Int, u32) { } /// Compute the remainder `(x * 2.pow(e)) % y` without overflow. -fn reduction(mut x: I, e: u32, y: I) -> I { - x %= y; - for _ in 0..e { - x <<= 1; - x = x.checked_sub(y).unwrap_or(x); +fn reduction(mut x: F::Int, e: u32, y: F::Int) -> F::Int +where + F: Float, + F::Int: HInt, + <::Int as HInt>::D: NarrowingDiv, +{ + // `f16` only has 5 exponent bits, so even `f16::MAX = 65504.0` is only + // a 40-bit integer multiple of the smallest subnormal. + if F::BITS == 16 { + debug_assert!(F::EXP_MAX - F::EXP_MIN == 29); + debug_assert!(e <= 29); + let u: u16 = x.cast(); + let v: u16 = y.cast(); + let u = (u as u64) << e; + let v = v as u64; + return F::Int::cast_from((u % v) as u16); } - x + + // Ensure `x < 2y` for later steps + if x >= (y << 1) { + // This case is only reached with subnormal divisors, + // but it might be better to just normalize all significands + // to make this unnecessary. The further calls could potentially + // benefit from assuming a specific fixed leading bit position. + x %= y; + } + + // The simple implementation seems to be fastest for a short reduction + // at this size. The limit here was chosen empirically on an Intel Nehalem. + // Less old CPUs that have faster `u64 * u64 -> u128` might not benefit, + // and 32-bit systems or architectures without hardware multipliers might + // want to do this in more cases. + if F::BITS == 64 && e < 32 { + // Assumes `x < 2y` + for _ in 0..e { + x = x.checked_sub(y).unwrap_or(x); + x <<= 1; + } + return x.checked_sub(y).unwrap_or(x); + } + + // Fast path for short reductions + if e < F::BITS { + let w = x.widen() << e; + if let Some((_, r)) = w.checked_narrowing_div_rem(y) { + return r; + } + } + + // Assumes `x < 2y` + crate::support::linear_mul_reduction(x, e, y) } diff --git a/library/compiler-builtins/libm/src/math/generic/scalbn.rs b/library/compiler-builtins/libm/src/math/generic/scalbn.rs index 6dd9b1a9b84a4..68de41757913a 100644 --- a/library/compiler-builtins/libm/src/math/generic/scalbn.rs +++ b/library/compiler-builtins/libm/src/math/generic/scalbn.rs @@ -96,14 +96,14 @@ where // Work aroudn this by using a different algorithm that calculates the prescale // dynamically based on the maximum possible value. This adds more operations per round // since it needs to construct the scale, but works better in the general case. - let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32); + let add = -(n + sig_total_bits as i32).max(exp_min); let mul = F::from_parts(false, (F::EXP_BIAS as i32 - add) as u32, zero); x *= mul; n += add; if n < exp_min { - let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32); + let add = -(n + sig_total_bits as i32).max(exp_min); let mul = F::from_parts(false, (F::EXP_BIAS as i32 - add) as u32, zero); x *= mul; diff --git a/library/compiler-builtins/libm/src/math/support/int_traits.rs b/library/compiler-builtins/libm/src/math/support/int_traits.rs index f1aa1e5b9b4d2..55b609affd2e6 100644 --- a/library/compiler-builtins/libm/src/math/support/int_traits.rs +++ b/library/compiler-builtins/libm/src/math/support/int_traits.rs @@ -296,7 +296,14 @@ int_impl!(i128, u128); /// Trait for integers twice the bit width of another integer. This is implemented for all /// primitives except for `u8`, because there is not a smaller primitive. -pub trait DInt: MinInt { +pub trait DInt: + MinInt + + ops::Add + + ops::Sub + + ops::Shl + + ops::Shr + + Ord +{ /// Integer that is half the bit width of the integer this trait is implemented for type H: HInt; diff --git a/library/compiler-builtins/libm/src/math/support/int_traits/narrowing_div.rs b/library/compiler-builtins/libm/src/math/support/int_traits/narrowing_div.rs index 3da0843cc5408..e76fc5ae9f4ca 100644 --- a/library/compiler-builtins/libm/src/math/support/int_traits/narrowing_div.rs +++ b/library/compiler-builtins/libm/src/math/support/int_traits/narrowing_div.rs @@ -7,7 +7,6 @@ use crate::support::{CastInto, DInt, HInt, Int, MinInt, u256}; /// This is the inverse of widening multiplication: /// - for any `x` and nonzero `y`: `x.widen_mul(y).checked_narrowing_div_rem(y) == Some((x, 0))`, /// - and for any `r in 0..y`: `x.carrying_mul(y, r).checked_narrowing_div_rem(y) == Some((x, r))`, -#[allow(dead_code)] pub trait NarrowingDiv: DInt + MinInt { /// Computes `(self / n, self % n))` /// diff --git a/library/compiler-builtins/libm/src/math/support/mod.rs b/library/compiler-builtins/libm/src/math/support/mod.rs index 7b529eb760b73..15ab010dc8d5f 100644 --- a/library/compiler-builtins/libm/src/math/support/mod.rs +++ b/library/compiler-builtins/libm/src/math/support/mod.rs @@ -8,6 +8,7 @@ pub(crate) mod feature_detect; mod float_traits; pub mod hex_float; mod int_traits; +mod modular; #[allow(unused_imports)] pub use big::{i256, u256}; @@ -28,8 +29,8 @@ pub use hex_float::hf16; pub use hex_float::hf128; #[allow(unused_imports)] pub use hex_float::{hf32, hf64}; -#[allow(unused_imports)] pub use int_traits::{CastFrom, CastInto, DInt, HInt, Int, MinInt, NarrowingDiv}; +pub use modular::linear_mul_reduction; /// Hint to the compiler that the current path is cold. pub fn cold_path() { diff --git a/library/compiler-builtins/libm/src/math/support/modular.rs b/library/compiler-builtins/libm/src/math/support/modular.rs new file mode 100644 index 0000000000000..cc0edf2f2bc04 --- /dev/null +++ b/library/compiler-builtins/libm/src/math/support/modular.rs @@ -0,0 +1,304 @@ +/* SPDX-License-Identifier: MIT OR Apache-2.0 */ + +//! This module provides accelerated modular multiplication by large powers +//! of two, which is needed for computing floating point remainders in `fmod` +//! and similar functions. +//! +//! To keep the equations somewhat concise, the following conventions are used: +//! - all integer operations are in the mathematical sense, without overflow +//! - concatenation means multiplication: `2xq = 2 * x * q` +//! - `R = (1 << U::BITS)` is the modulus of wrapping arithmetic in `U` + +use crate::support::int_traits::NarrowingDiv; +use crate::support::{DInt, HInt, Int}; + +/// Compute the remainder `(x << e) % y` with unbounded integers. +/// Requires `x < 2y` and `y.leading_zeros() >= 2` +pub fn linear_mul_reduction(x: U, mut e: u32, mut y: U) -> U +where + U: HInt + Int, + U::D: NarrowingDiv, +{ + assert!(y <= U::MAX >> 2); + assert!(x < (y << 1)); + let _0 = U::ZERO; + let _1 = U::ONE; + + // power of two divisors + if (y & (y - _1)).is_zero() { + if e < U::BITS { + // shift and only keep low bits + return (x << e) & (y - _1); + } else { + // would shift out all the bits + return _0; + } + } + + // Use the identity `(x << e) % y == ((x << (e + s)) % (y << s)) >> s` + // to shift the divisor so it has exactly two leading zeros to satisfy + // the precondition of `Reducer::new` + let s = y.leading_zeros() - 2; + e += s; + y <<= s; + + // `m: Reducer` keeps track of the remainder `x` in a form that makes it + // very efficient to do `x <<= k` modulo `y` for integers `k < U::BITS` + let mut m = Reducer::new(x, y); + + // Use the faster special case with constant `k == U::BITS - 1` while we can + while e >= U::BITS - 1 { + m.word_reduce(); + e -= U::BITS - 1; + } + // Finish with the variable shift operation + m.shift_reduce(e); + + // The partial remainder is in `[0, 2y)` ... + let r = m.partial_remainder(); + // ... so check and correct, and compensate for the earlier shift. + r.checked_sub(y).unwrap_or(r) >> s +} + +/// Helper type for computing the reductions. The implementation has a number +/// of seemingly weird choices, but everything is aimed at streamlining +/// `Reducer::word_reduce` into its current form. +/// +/// Implicitly contains: +/// n in (R/8, R/4) +/// x in [0, 2n) +/// The value of `n` is fixed for a given `Reducer`, +/// but the value of `x` is modified by the methods. +#[derive(Debug, Clone, PartialEq, Eq)] +struct Reducer { + // m = 2n + m: U, + // q = (RR/2) / m + // r = (RR/2) % m + // Then RR/2 = qm + r, where `0 <= r < m` + // The value `q` is only needed during construction, so isn't saved. + r: U, + // The value `x` is implicitly stored as `2 * q * x`: + _2xq: U::D, +} + +impl Reducer +where + U: HInt, + U: Int, +{ + /// Construct a reducer for `(x << _) mod n`. + /// + /// Requires `R/8 < n < R/4` and `x < 2n`. + fn new(x: U, n: U) -> Self + where + U::D: NarrowingDiv, + { + let _1 = U::ONE; + assert!(n > (_1 << (U::BITS - 3))); + assert!(n < (_1 << (U::BITS - 2))); + let m = n << 1; + assert!(x < m); + + // We need to compute the parameters + // `q = (RR/2) / m` + // `r = (RR/2) % m` + + // Since `m` is in `(R/4, R/2)`, the quotient `q` is in `[R, 2R)`, and + // it would overflow in `U` if computed directly. Instead, we compute + // `f = q - R`, which is in `[0, R)`. To do so, we simply subtract `Rm` + // from the dividend, which doesn't change the remainder: + // `f = R(R/2 - m) / m` + // `r = R(R/2 - m) % m` + let dividend = ((_1 << (U::BITS - 1)) - m).widen_hi(); + let (f, r) = dividend.checked_narrowing_div_rem(m).unwrap(); + + // As `x < m`, `xq < qm <= RR/2` + // Thus `2xq = 2xR + 2xf` does not overflow in `U::D`. + let _2x = x + x; + let _2xq = _2x.widen_hi() + _2x.widen_mul(f); + Self { m, r, _2xq } + } + + /// Extract the current remainder `x` in the range `[0, 2n)` + fn partial_remainder(&self) -> U { + // `RR/2 = qm + r`, where `0 <= r < m` + // `2xq = uR + v`, where `0 <= v < R` + + // The goal is to extract the current value of `x` from the value `2xq` + // that we actually have. A bit simplified, we could multiply it by `m` + // to obtain `2xqm == 2x(RR/2 - r) == xRR - 2xr`, where `2xr < RR`. + // We could just round that up to the next multiple of `RR` to get `x`, + // but we can avoid having to multiply the full double-wide `2xq` by + // making a couple of adjustments: + + // First, let's only use the high half `u` for the product, and + // include an additional error term due to the truncation: + // `mu = xR - (2xr + mv)/R` + + // Next, show bounds for the error term + // `0 <= mv < mR` follows from `0 <= v < R` + // `0 <= 2xr < mR` follows from `0 <= x < m < R/2` and `0 <= r < m` + // Adding those together, we have: + // `0 <= (mv + 2xr)/R < 2m` + // Which also implies: + // `0 < 2m - (mv + 2xr)/R <= 2m < R` + + // For that reason, we can use `u + 2` as the factor to obtain + // `m(u + 2) = xR + (2m - (mv + 2xr)/R)` + // By the previous inequality, the second term fits neatly in the lower + // half, so we get exactly `x` as the high half. + let u = self._2xq.hi(); + let _2 = U::ONE + U::ONE; + self.m.widen_mul(u + _2).hi() + + // Additionally, we should ensure that `u + 2` cannot overflow: + // Since `x < m` and `2qm <= RR`, + // `2xq <= 2q(m-1) <= RR - 2q` + // As we also have `q > R`, + // `2xq < RR - 2R` + // which is sufficient. + } + + /// Replace the remainder `x` with `(x << k) - un`, + /// for a suitable quotient `u`, which is returned. + /// + /// Requires that `k < U::BITS`. + fn shift_reduce(&mut self, k: u32) -> U { + assert!(k < U::BITS); + + // First, split the shifted value: + // `2xq << k = aRR/2 + b`, where `0 <= b < RR/2` + let a = self._2xq.hi() >> (U::BITS - 1 - k); + let (low, high) = (self._2xq << k).lo_hi(); + let b = U::D::from_lo_hi(low, high & (U::MAX >> 1)); + + // Then, subtract `2anq = aqm`: + // ``` + // (2xq << k) - aqm + // = aRR/2 + b - aqm + // = a(RR/2 - qm) + b + // = ar + b + // ``` + self._2xq = a.widen_mul(self.r) + b; + a + + // Since `a` is at most the high half of `2xq`, we have + // `a + 2 < R` (shown above, in `partial_remainder`) + // Using that together with `b < RR/2` and `r < m < R/2`, + // we get `(a + 2)r + b < RR`, so + // `ar + b < RR - 2r = 2mq` + // which shows that the new remainder still satisfies `x < m`. + } + + // NB: `word_reduce()` is just the special case `shift_reduce(U::BITS - 1)` + // that optimizes especially well. The correspondence is that `a == u` and + // `b == (v >> 1).widen_hi()` + // + /// Replace the remainder `x` with `x(R/2) - un`, + /// for a suitable quotient `u`, which is returned. + fn word_reduce(&mut self) -> U { + // To do so, we replace `2xq = uR + v` with + // ``` + // 2 * (x(R/2) - un) * q + // = xqR - 2unq + // = xqR - uqm + // = uRR/2 + vR/2 - uRR/2 + ur + // = ur + (v/2)R + // ``` + let (v, u) = self._2xq.lo_hi(); + self._2xq = u.widen_mul(self.r) + U::widen_hi(v >> 1); + u + + // Additional notes: + // 1. As `v` is the low bits of `2xq`, it is even and can be halved. + // 2. The new remainder is `(xr + mv/2) / R` (see below) + // and since `v < R`, `r < m`, `x < m < R/2`, + // that is also strictly less than `m`. + // ``` + // (x(R/2) - un)R + // = xRR/2 - (m/2)uR + // = x(qm + r) - (m/2)(2xq - v) + // = xqm + xr - xqm + mv/2 + // = xr + mv/2 + // ``` + } +} + +#[cfg(test)] +mod test { + use crate::support::linear_mul_reduction; + use crate::support::modular::Reducer; + + #[test] + fn reducer_ops() { + for n in 33..=63_u8 { + for x in 0..2 * n { + let temp = Reducer::new(x, n); + let n = n as u32; + let x0 = temp.partial_remainder() as u32; + assert_eq!(x as u32, x0); + for k in 0..=7 { + let mut red = temp.clone(); + let u = red.shift_reduce(k) as u32; + let x1 = red.partial_remainder() as u32; + assert_eq!(x1, (x0 << k) - u * n); + assert!(x1 < 2 * n); + assert!((red._2xq as u32).is_multiple_of(2 * x1)); + + // `word_reduce` is equivalent to + // `shift_reduce(U::BITS - 1)` + if k == 7 { + let mut alt = temp.clone(); + let w = alt.word_reduce(); + assert_eq!(u, w as u32); + assert_eq!(alt, red); + } + } + } + } + } + #[test] + fn reduction_u8() { + for y in 1..64u8 { + for x in 0..2 * y { + let mut r = x % y; + for e in 0..100 { + assert_eq!(r, linear_mul_reduction(x, e, y)); + // maintain the correct expected remainder + r <<= 1; + if r >= y { + r -= y; + } + } + } + } + } + #[test] + fn reduction_u128() { + assert_eq!( + linear_mul_reduction::(17, 100, 123456789), + (17 << 100) % 123456789 + ); + + // power-of-two divisor + assert_eq!( + linear_mul_reduction(0xdead_beef, 100, 1_u128 << 116), + 0xbeef << 100 + ); + + let x = 10_u128.pow(37); + let y = 11_u128.pow(36); + assert!(x < y); + let mut r = x; + for e in 0..1000 { + assert_eq!(r, linear_mul_reduction(x, e, y)); + // maintain the correct expected remainder + r <<= 1; + if r >= y { + r -= y; + } + assert!(r != 0); + } + } +} diff --git a/library/compiler-builtins/rust-version b/library/compiler-builtins/rust-version index 71fbbbaa984f6..7345c25066a82 100644 --- a/library/compiler-builtins/rust-version +++ b/library/compiler-builtins/rust-version @@ -1 +1 @@ -47cd7120d9b4e1b64eb27c87522a07888197fae8 +2dc30247c5d8293aaa31e1d7dae2ed2fde908ada diff --git a/library/compiler-builtins/triagebot.toml b/library/compiler-builtins/triagebot.toml index eba5cdd88b941..b210a5fb52563 100644 --- a/library/compiler-builtins/triagebot.toml +++ b/library/compiler-builtins/triagebot.toml @@ -12,10 +12,11 @@ exclude_titles = ["Rustc pull update"] [issue-links] check-commits = false -# Prevents mentions in commits to avoid users being spammed -# Documentation at: https://forge.rust-lang.org/triagebot/no-mentions.html -[no-mentions] - # Enable issue transfers within the org # Documentation at: https://forge.rust-lang.org/triagebot/transfer.html [transfer] + +# Enable comments linking to triagebot range-diff when a PR is rebased +# onto a different base commit +# Documentation at: https://forge.rust-lang.org/triagebot/range-diff.html +[range-diff]