diff --git a/src/int/leading_zeros.rs b/src/int/leading_zeros.rs index 78556f0bc..e4a9e5eb2 100644 --- a/src/int/leading_zeros.rs +++ b/src/int/leading_zeros.rs @@ -4,6 +4,7 @@ // Compilers will insert the check for zero in cases where it is needed. /// Returns the number of leading binary zeros in `x`. +#[doc(hidden)] pub fn usize_leading_zeros_default(x: usize) -> usize { // The basic idea is to test if the higher bits of `x` are zero and bisect the number // of leading zeros. It is possible for all branches of the bisection to use the same @@ -75,6 +76,7 @@ pub fn usize_leading_zeros_default(x: usize) -> usize { // RISC-V that allows `(x >= power-of-two) as usize` to be branchless. /// Returns the number of leading binary zeros in `x`. +#[doc(hidden)] pub fn usize_leading_zeros_riscv(x: usize) -> usize { let mut x = x; // the number of potential leading zeros diff --git a/src/int/sdiv.rs b/src/int/sdiv.rs index 3d0c3afc1..e1e3f33bb 100644 --- a/src/int/sdiv.rs +++ b/src/int/sdiv.rs @@ -1,65 +1,166 @@ -use int::specialized_div_rem::*; +use int::udiv::*; -intrinsics! { - #[maybe_use_optimized_c_shim] - #[arm_aeabi_alias = __aeabi_idiv] - /// Returns `n / d` - pub extern "C" fn __divsi3(a: i32, b: i32) -> i32 { - i32_div_rem(a, b).0 - } - - #[maybe_use_optimized_c_shim] - /// Returns `n % d` - pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 { - i32_div_rem(a, b).1 - } - - #[maybe_use_optimized_c_shim] - /// Returns `n / d` and sets `*rem = n % d` - pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 { - let quo_rem = i32_div_rem(a, b); - *rem = quo_rem.1; - quo_rem.0 +macro_rules! sdivmod { + ( + $unsigned_fn:ident, // name of the unsigned division function + $signed_fn:ident, // name of the signed division function + $uX:ident, // unsigned integer type for the inputs and outputs of `$unsigned_fn` + $iX:ident, // signed integer type for the inputs and outputs of `$signed_fn` + $($attr:tt),* // attributes + ) => { + intrinsics! 
{ + $( + #[$attr] + )* + /// Returns `n / d` and sets `*rem = n % d` + pub extern "C" fn $signed_fn(a: $iX, b: $iX, rem: &mut $iX) -> $iX { + let a_neg = a < 0; + let b_neg = b < 0; + let mut a = a; + let mut b = b; + if a_neg { + a = a.wrapping_neg(); + } + if b_neg { + b = b.wrapping_neg(); + } + let mut r = *rem as $uX; + let t = $unsigned_fn(a as $uX, b as $uX, Some(&mut r)) as $iX; + let mut r = r as $iX; + if a_neg { + r = r.wrapping_neg(); + } + *rem = r; + if a_neg != b_neg { + t.wrapping_neg() + } else { + t + } + } + } } +} - #[maybe_use_optimized_c_shim] - /// Returns `n / d` - pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 { - i64_div_rem(a, b).0 +macro_rules! sdiv { + ( + $unsigned_fn:ident, // name of the unsigned division function + $signed_fn:ident, // name of the signed division function + $uX:ident, // unsigned integer type for the inputs and outputs of `$unsigned_fn` + $iX:ident, // signed integer type for the inputs and outputs of `$signed_fn` + $($attr:tt),* // attributes + ) => { + intrinsics! { + $( + #[$attr] + )* + /// Returns `n / d` + pub extern "C" fn $signed_fn(a: $iX, b: $iX) -> $iX { + let a_neg = a < 0; + let b_neg = b < 0; + let mut a = a; + let mut b = b; + if a_neg { + a = a.wrapping_neg(); + } + if b_neg { + b = b.wrapping_neg(); + } + let t = $unsigned_fn(a as $uX, b as $uX) as $iX; + if a_neg != b_neg { + t.wrapping_neg() + } else { + t + } + } + } } +} - #[maybe_use_optimized_c_shim] - /// Returns `n % d` - pub extern "C" fn __moddi3(a: i64, b: i64) -> i64 { - i64_div_rem(a, b).1 +macro_rules! smod { + ( + $unsigned_fn:ident, // name of the unsigned division function + $signed_fn:ident, // name of the signed division function + $uX:ident, // unsigned integer type for the inputs and outputs of `$unsigned_fn` + $iX:ident, // signed integer type for the inputs and outputs of `$signed_fn` + $($attr:tt),* // attributes + ) => { + intrinsics! 
{ + $( + #[$attr] + )* + /// Returns `n % d` + pub extern "C" fn $signed_fn(a: $iX, b: $iX) -> $iX { + let a_neg = a < 0; + let b_neg = b < 0; + let mut a = a; + let mut b = b; + if a_neg { + a = a.wrapping_neg(); + } + if b_neg { + b = b.wrapping_neg(); + } + let r = $unsigned_fn(a as $uX, b as $uX) as $iX; + if a_neg { + r.wrapping_neg() + } else { + r + } + } + } } +} +sdivmod!( + __udivmodsi4, + __divmodsi4, + u32, + i32, + maybe_use_optimized_c_shim +); +// The `#[arm_aeabi_alias = __aeabi_idiv]` attribute cannot be made to work with `intrinsics!` in macros +intrinsics! { #[maybe_use_optimized_c_shim] - /// Returns `n / d` and sets `*rem = n % d` - pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 { - let quo_rem = i64_div_rem(a, b); - *rem = quo_rem.1; - quo_rem.0 - } - - #[win64_128bit_abi_hack] + #[arm_aeabi_alias = __aeabi_idiv] /// Returns `n / d` - pub extern "C" fn __divti3(a: i128, b: i128) -> i128 { - i128_div_rem(a, b).0 + pub extern "C" fn __divsi3(a: i32, b: i32) -> i32 { + let a_neg = a < 0; + let b_neg = b < 0; + let mut a = a; + let mut b = b; + if a_neg { + a = a.wrapping_neg(); + } + if b_neg { + b = b.wrapping_neg(); + } + let t = __udivsi3(a as u32, b as u32) as i32; + if a_neg != b_neg { + t.wrapping_neg() + } else { + t + } } +} +smod!(__umodsi3, __modsi3, u32, i32, maybe_use_optimized_c_shim); - #[win64_128bit_abi_hack] - /// Returns `n % d` - pub extern "C" fn __modti3(a: i128, b: i128) -> i128 { - i128_div_rem(a, b).1 - } +sdivmod!( + __udivmoddi4, + __divmoddi4, + u64, + i64, + maybe_use_optimized_c_shim +); +sdiv!(__udivdi3, __divdi3, u64, i64, maybe_use_optimized_c_shim); +smod!(__umoddi3, __moddi3, u64, i64, maybe_use_optimized_c_shim); - // LLVM does not currently have a `__divmodti4` function, but GCC does - #[maybe_use_optimized_c_shim] - /// Returns `n / d` and sets `*rem = n % d` - pub extern "C" fn __divmodti4(a: i128, b: i128, rem: &mut i128) -> i128 { - let quo_rem = i128_div_rem(a, b); - *rem = quo_rem.1; 
- quo_rem.0 - } -} +// LLVM does not currently have a `__divmodti4` function, but GCC does +sdivmod!( + __udivmodti4, + __divmodti4, + u128, + i128, + maybe_use_optimized_c_shim +); +sdiv!(__udivti3, __divti3, u128, i128, win64_128bit_abi_hack); +smod!(__umodti3, __modti3, u128, i128, win64_128bit_abi_hack); diff --git a/src/int/specialized_div_rem/asymmetric.rs b/src/int/specialized_div_rem/asymmetric.rs index 861e91742..45da657e9 100644 --- a/src/int/specialized_div_rem/asymmetric.rs +++ b/src/int/specialized_div_rem/asymmetric.rs @@ -1,44 +1,26 @@ -/// Creates unsigned and signed division functions optimized for dividing integers with the same +/// Creates an unsigned division function optimized for dividing integers with the same /// bitwidth as the largest operand in an asymmetrically sized division. For example, x86-64 has an /// assembly instruction that can divide a 128 bit integer by a 64 bit integer if the quotient fits /// in 64 bits. The 128 bit version of this algorithm would use that fast hardware division to /// construct a full 128 bit by 128 bit division. +#[doc(hidden)] #[macro_export] macro_rules! 
impl_asymmetric { ( - $unsigned_name:ident, // name of the unsigned division function - $signed_name:ident, // name of the signed division function + $fn:ident, // name of the unsigned division function $zero_div_fn:ident, // function called when division by zero is attempted $half_division:ident, // function for division of a $uX by a $uX $asymmetric_division:ident, // function for division of a $uD by a $uX $n_h:expr, // the number of bits in a $iH or $uH $uH:ident, // unsigned integer with half the bit width of $uX $uX:ident, // unsigned integer with half the bit width of $uD - $uD:ident, // unsigned integer type for the inputs and outputs of `$unsigned_name` - $iD:ident, // signed integer type for the inputs and outputs of `$signed_name` - $($unsigned_attr:meta),*; // attributes for the unsigned function - $($signed_attr:meta),* // attributes for the signed function + $uD:ident // unsigned integer type for the inputs and outputs of `$fn` ) => { /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a /// tuple. - $( - #[$unsigned_attr] - )* - pub fn $unsigned_name(duo: $uD, div: $uD) -> ($uD,$uD) { - fn carrying_mul(lhs: $uX, rhs: $uX) -> ($uX, $uX) { - let tmp = (lhs as $uD).wrapping_mul(rhs as $uD); - (tmp as $uX, (tmp >> ($n_h * 2)) as $uX) - } - fn carrying_mul_add(lhs: $uX, mul: $uX, add: $uX) -> ($uX, $uX) { - let tmp = (lhs as $uD).wrapping_mul(mul as $uD).wrapping_add(add as $uD); - (tmp as $uX, (tmp >> ($n_h * 2)) as $uX) - } - + pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) { let n: u32 = $n_h * 2; - // Many of these subalgorithms are taken from trifecta.rs, see that for better - // documentation. - let duo_lo = duo as $uX; let duo_hi = (duo >> n) as $uX; let div_lo = div as $uX; @@ -50,120 +32,39 @@ macro_rules! 
impl_asymmetric { if duo_hi < div_lo { // `$uD` by `$uX` division with a quotient that will fit into a `$uX` let (quo, rem) = unsafe { $asymmetric_division(duo, div_lo) }; - return (quo as $uD, rem as $uD) - } else if (div_lo >> $n_h) == 0 { - // Short division of $uD by a $uH. - - // Some x86_64 CPUs have bad division implementations that make specializing - // this case faster. - let div_0 = div_lo as $uH as $uX; - let (quo_hi, rem_3) = $half_division(duo_hi, div_0); - - let duo_mid = - ((duo >> $n_h) as $uH as $uX) - | (rem_3 << $n_h); - let (quo_1, rem_2) = $half_division(duo_mid, div_0); - - let duo_lo = - (duo as $uH as $uX) - | (rem_2 << $n_h); - let (quo_0, rem_1) = $half_division(duo_lo, div_0); - - return ( - (quo_0 as $uD) - | ((quo_1 as $uD) << $n_h) - | ((quo_hi as $uD) << n), - rem_1 as $uD - ) + return (quo as $uD, rem as $uD); } else { // Short division using the $uD by $uX division let (quo_hi, rem_hi) = $half_division(duo_hi, div_lo); let tmp = unsafe { $asymmetric_division((duo_lo as $uD) | ((rem_hi as $uD) << n), div_lo) }; - return ((tmp.0 as $uD) | ((quo_hi as $uD) << n), tmp.1 as $uD) + return ((tmp.0 as $uD) | ((quo_hi as $uD) << n), tmp.1 as $uD); } } - let duo_lz = duo_hi.leading_zeros(); + // This has been adapted from + // https://www.codeproject.com/tips/785014/uint-division-modulus which was in turn + // adapted from Hacker's Delight. This is similar to the two possibility algorithm + // in that it uses only more significant parts of `duo` and `div` to divide a large + // integer with a smaller division instruction. let div_lz = div_hi.leading_zeros(); - let rel_leading_sb = div_lz.wrapping_sub(duo_lz); - if rel_leading_sb < $n_h { - // Some x86_64 CPUs have bad hardware division implementations that make putting - // a two possibility algorithm here beneficial. We also avoid a full `$uD` - // multiplication. 
- let shift = n - duo_lz; - let duo_sig_n = (duo >> shift) as $uX; - let div_sig_n = (div >> shift) as $uX; - let quo = $half_division(duo_sig_n, div_sig_n).0; - let div_lo = div as $uX; - let div_hi = (div >> n) as $uX; - let (tmp_lo, carry) = carrying_mul(quo, div_lo); - let (tmp_hi, overflow) = carrying_mul_add(quo, div_hi, carry); - let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n); - if (overflow != 0) || (duo < tmp) { - return ( - (quo - 1) as $uD, - duo.wrapping_add(div).wrapping_sub(tmp) - ) - } else { - return ( - quo as $uD, - duo - tmp - ) - } - } else { - // This has been adapted from - // https://www.codeproject.com/tips/785014/uint-division-modulus which was in turn - // adapted from Hacker's Delight. This is similar to the two possibility algorithm - // in that it uses only more significant parts of `duo` and `div` to divide a large - // integer with a smaller division instruction. - - let div_extra = n - div_lz; - let div_sig_n = (div >> div_extra) as $uX; - let tmp = unsafe { - $asymmetric_division(duo >> 1, div_sig_n) - }; - - let mut quo = tmp.0 >> ((n - 1) - div_lz); - if quo != 0 { - quo -= 1; - } + let div_extra = n - div_lz; + let div_sig_n = (div >> div_extra) as $uX; + let tmp = unsafe { $asymmetric_division(duo >> 1, div_sig_n) }; - // Note that this is a full `$uD` multiplication being used here - let mut rem = duo - (quo as $uD).wrapping_mul(div); - if div <= rem { - quo += 1; - rem -= div; - } - return (quo as $uD, rem) + let mut quo = tmp.0 >> ((n - 1) - div_lz); + if quo != 0 { + quo -= 1; } - } - /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a - /// tuple. 
- $( - #[$signed_attr] - )* - pub fn $signed_name(duo: $iD, div: $iD) -> ($iD, $iD) { - match (duo < 0, div < 0) { - (false, false) => { - let t = $unsigned_name(duo as $uD, div as $uD); - (t.0 as $iD, t.1 as $iD) - }, - (true, false) => { - let t = $unsigned_name(duo.wrapping_neg() as $uD, div as $uD); - ((t.0 as $iD).wrapping_neg(), (t.1 as $iD).wrapping_neg()) - }, - (false, true) => { - let t = $unsigned_name(duo as $uD, div.wrapping_neg() as $uD); - ((t.0 as $iD).wrapping_neg(), t.1 as $iD) - }, - (true, true) => { - let t = $unsigned_name(duo.wrapping_neg() as $uD, div.wrapping_neg() as $uD); - (t.0 as $iD, (t.1 as $iD).wrapping_neg()) - }, + // Note that this is a full `$uD` multiplication being used here + let mut rem = duo - (quo as $uD).wrapping_mul(div); + if div <= rem { + quo += 1; + rem -= div; } + return (quo as $uD, rem); } - } + }; } diff --git a/src/int/specialized_div_rem/binary_long.rs b/src/int/specialized_div_rem/binary_long.rs index 4c63396a0..7de10e852 100644 --- a/src/int/specialized_div_rem/binary_long.rs +++ b/src/int/specialized_div_rem/binary_long.rs @@ -1,35 +1,30 @@ -/// Creates unsigned and signed division functions that use binary long division, designed for +/// Creates an unsigned division function that uses binary long division, designed for /// computer architectures without division instructions. These functions have good performance for /// microarchitectures with large branch miss penalties and architectures without the ability to /// predicate instructions. For architectures with predicated instructions, one of the algorithms /// described in the documentation of these functions probably has higher performance, and a custom /// assembly routine should be used instead. +#[doc(hidden)] #[macro_export] macro_rules! 
impl_binary_long { ( - $unsigned_name:ident, // name of the unsigned division function - $signed_name:ident, // name of the signed division function + $fn:ident, // name of the unsigned division function $zero_div_fn:ident, // function called when division by zero is attempted $normalization_shift:ident, // function for finding the normalization shift $n:tt, // the number of bits in a $iX or $uX - $uX:ident, // unsigned integer type for the inputs and outputs of `$unsigned_name` - $iX:ident, // signed integer type for the inputs and outputs of `$signed_name` - $($unsigned_attr:meta),*; // attributes for the unsigned function - $($signed_attr:meta),* // attributes for the signed function + $uX:ident, // unsigned integer type for the inputs and outputs of `$fn` + $iX:ident // signed integer type with same bitwidth as `$uX` ) => { /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a /// tuple. - $( - #[$unsigned_attr] - )* - pub fn $unsigned_name(duo: $uX, div: $uX) -> ($uX, $uX) { + pub fn $fn(duo: $uX, div: $uX) -> ($uX, $uX) { let mut duo = duo; // handle edge cases before calling `$normalization_shift` if div == 0 { $zero_div_fn() } if duo < div { - return (0, duo) + return (0, duo); } // There are many variations of binary division algorithm that could be used. This @@ -430,7 +425,7 @@ macro_rules! impl_binary_long { let mut i = shl; loop { if i == 0 { - break + break; } i -= 1; // shift left 1 and subtract @@ -550,47 +545,5 @@ macro_rules! impl_binary_long { return ((duo & mask) | quo, duo >> shl); */ } - - /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a - /// tuple. - $( - #[$signed_attr] - )* - pub fn $signed_name(duo: $iX, div: $iX) -> ($iX, $iX) { - // There is a way of doing this without any branches, but requires too many extra - // operations to be faster. 
- /* - let duo_s = duo >> ($n - 1); - let div_s = div >> ($n - 1); - let duo = (duo ^ duo_s).wrapping_sub(duo_s); - let div = (div ^ div_s).wrapping_sub(div_s); - let quo_s = duo_s ^ div_s; - let rem_s = duo_s; - let tmp = $unsigned_name(duo as $uX, div as $uX); - ( - ((tmp.0 as $iX) ^ quo_s).wrapping_sub(quo_s), - ((tmp.1 as $iX) ^ rem_s).wrapping_sub(rem_s), - ) - */ - - match (duo < 0, div < 0) { - (false, false) => { - let t = $unsigned_name(duo as $uX, div as $uX); - (t.0 as $iX, t.1 as $iX) - }, - (true, false) => { - let t = $unsigned_name(duo.wrapping_neg() as $uX, div as $uX); - ((t.0 as $iX).wrapping_neg(), (t.1 as $iX).wrapping_neg()) - }, - (false, true) => { - let t = $unsigned_name(duo as $uX, div.wrapping_neg() as $uX); - ((t.0 as $iX).wrapping_neg(), t.1 as $iX) - }, - (true, true) => { - let t = $unsigned_name(duo.wrapping_neg() as $uX, div.wrapping_neg() as $uX); - (t.0 as $iX, (t.1 as $iX).wrapping_neg()) - }, - } - } - } + }; } diff --git a/src/int/specialized_div_rem/delegate.rs b/src/int/specialized_div_rem/delegate.rs index 1ba72431d..8310c1429 100644 --- a/src/int/specialized_div_rem/delegate.rs +++ b/src/int/specialized_div_rem/delegate.rs @@ -1,29 +1,24 @@ -/// Creates unsigned and signed division functions that use a combination of hardware division and +/// Creates an unsigned division function that uses a combination of hardware division and /// binary long division to divide integers larger than what hardware division by itself can do. This /// function is intended for microarchitectures that have division hardware, but not fast enough /// multiplication hardware for `impl_trifecta` to be faster. +#[doc(hidden)] #[macro_export] macro_rules! 
impl_delegate { ( - $unsigned_name:ident, // name of the unsigned division function - $signed_name:ident, // name of the signed division function + $fn:ident, // name of the unsigned division function $zero_div_fn:ident, // function called when division by zero is attempted $half_normalization_shift:ident, // function for finding the normalization shift of $uX $half_division:ident, // function for division of a $uX by a $uX $n_h:expr, // the number of bits in $iH or $uH $uH:ident, // unsigned integer with half the bit width of $uX $uX:ident, // unsigned integer with half the bit width of $uD. - $uD:ident, // unsigned integer type for the inputs and outputs of `$unsigned_name` - $iD:ident, // signed integer type for the inputs and outputs of `$signed_name` - $($unsigned_attr:meta),*; // attributes for the unsigned function - $($signed_attr:meta),* // attributes for the signed function + $uD:ident, // unsigned integer type for the inputs and outputs of `$fn` + $iD:ident // signed integer type with the same bitwidth as `$uD` ) => { /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a /// tuple. - $( - #[$unsigned_attr] - )* - pub fn $unsigned_name(duo: $uD, div: $uD) -> ($uD, $uD) { + pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) { // The two possibility algorithm, undersubtracting long division algorithm, or any kind // of reciprocal based algorithm will not be fastest, because they involve large // multiplications that we assume to not be fast enough relative to the divisions to @@ -38,17 +33,15 @@ macro_rules! 
impl_delegate { let div_hi = (div >> n) as $uX; match (div_lo == 0, div_hi == 0, duo_hi == 0) { - (true, true, _) => { - $zero_div_fn() - } + (true, true, _) => $zero_div_fn(), (_, false, true) => { // `duo` < `div` - return (0, duo) + return (0, duo); } (false, true, true) => { // delegate to smaller division let tmp = $half_division(duo_lo, div_lo); - return (tmp.0 as $uD, tmp.1 as $uD) + return (tmp.0 as $uD, tmp.1 as $uD); } (false, true, false) => { if duo_hi < div_lo { @@ -96,7 +89,7 @@ macro_rules! impl_delegate { // Delegate to get the rest of the quotient. Note that the // `div_lo` here is the original unshifted `div`. let tmp = $half_division(duo as $uX, div_lo); - return ((quo_lo | tmp.0) as $uD, tmp.1 as $uD) + return ((quo_lo | tmp.0) as $uD, tmp.1 as $uD); } } div >>= 1; @@ -105,7 +98,7 @@ macro_rules! impl_delegate { } else if duo_hi == div_lo { // `quo_hi == 1`. This branch is cheap and helps with edge cases. let tmp = $half_division(duo as $uX, div as $uX); - return ((1 << n) | (tmp.0 as $uD), tmp.1 as $uD) + return ((1 << n) | (tmp.0 as $uD), tmp.1 as $uD); } else { // `div_lo < duo_hi` // `rem_hi == 0` @@ -114,22 +107,16 @@ macro_rules! impl_delegate { let div_0 = div_lo as $uH as $uX; let (quo_hi, rem_3) = $half_division(duo_hi, div_0); - let duo_mid = - ((duo >> $n_h) as $uH as $uX) - | (rem_3 << $n_h); + let duo_mid = ((duo >> $n_h) as $uH as $uX) | (rem_3 << $n_h); let (quo_1, rem_2) = $half_division(duo_mid, div_0); - let duo_lo = - (duo as $uH as $uX) - | (rem_2 << $n_h); + let duo_lo = (duo as $uH as $uX) | (rem_2 << $n_h); let (quo_0, rem_1) = $half_division(duo_lo, div_0); return ( - (quo_0 as $uD) - | ((quo_1 as $uD) << $n_h) - | ((quo_hi as $uD) << n), - rem_1 as $uD - ) + (quo_0 as $uD) | ((quo_1 as $uD) << $n_h) | ((quo_hi as $uD) << n), + rem_1 as $uD, + ); } // This is basically a short division composed of a half division for the hi @@ -161,7 +148,7 @@ macro_rules! 
impl_delegate { let tmp = $half_division(duo as $uX, div_lo); return ( (tmp.0) as $uD | (quo_lo as $uD) | ((quo_hi as $uD) << n), - tmp.1 as $uD + tmp.1 as $uD, ); } } @@ -187,7 +174,7 @@ macro_rules! impl_delegate { duo = sub; quo_lo |= pow_lo; if duo < div_original { - return (quo_lo as $uD, duo) + return (quo_lo as $uD, duo); } } div >>= 1; @@ -196,31 +183,5 @@ macro_rules! impl_delegate { } } } - - /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a - /// tuple. - $( - #[$signed_attr] - )* - pub fn $signed_name(duo: $iD, div: $iD) -> ($iD, $iD) { - match (duo < 0, div < 0) { - (false, false) => { - let t = $unsigned_name(duo as $uD, div as $uD); - (t.0 as $iD, t.1 as $iD) - }, - (true, false) => { - let t = $unsigned_name(duo.wrapping_neg() as $uD, div as $uD); - ((t.0 as $iD).wrapping_neg(), (t.1 as $iD).wrapping_neg()) - }, - (false, true) => { - let t = $unsigned_name(duo as $uD, div.wrapping_neg() as $uD); - ((t.0 as $iD).wrapping_neg(), t.1 as $iD) - }, - (true, true) => { - let t = $unsigned_name(duo.wrapping_neg() as $uD, div.wrapping_neg() as $uD); - (t.0 as $iD, (t.1 as $iD).wrapping_neg()) - }, - } - } - } + }; } diff --git a/src/int/specialized_div_rem/mod.rs b/src/int/specialized_div_rem/mod.rs index f7dc044fa..3ac341b6f 100644 --- a/src/int/specialized_div_rem/mod.rs +++ b/src/int/specialized_div_rem/mod.rs @@ -111,13 +111,6 @@ fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) { zero_div_fn() } -// `inline(never)` is placed on unsigned division functions so that there are just three division -// functions (`u32_div_rem`, `u64_div_rem`, and `u128_div_rem`) backing all `compiler-builtins` -// division functions. The signed functions like `i32_div_rem` will get inlined into the -// `compiler-builtins` signed division functions, so that they directly call the three division -// functions. 
Otherwise, LLVM may try to inline the unsigned division functions 4 times into the -// signed division functions, which results in an explosion in code size. - // Whether `trifecta` or `delegate` is faster for 128 bit division depends on the speed at which a // microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is // faster if the target pointer width is at least 64. @@ -127,16 +120,12 @@ fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) { ))] impl_trifecta!( u128_div_rem, - i128_div_rem, zero_div_fn, u64_by_u64_div_rem, 32, u32, u64, - u128, - i128, - inline(never); - inline + u128 ); // If the pointer width less than 64, then the target architecture almost certainly does not have @@ -147,7 +136,6 @@ impl_trifecta!( ))] impl_delegate!( u128_div_rem, - i128_div_rem, zero_div_fn, u64_normalization_shift, u64_by_u64_div_rem, @@ -155,9 +143,7 @@ impl_delegate!( u32, u64, u128, - i128, - inline(never); - inline + i128 ); /// Divides `duo` by `div` and returns a tuple of the quotient and the remainder. @@ -191,17 +177,13 @@ unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) { #[cfg(all(feature = "asm", target_arch = "x86_64"))] impl_asymmetric!( u128_div_rem, - i128_div_rem, zero_div_fn, u64_by_u64_div_rem, u128_by_u64_div_rem, 32, u32, u64, - u128, - i128, - inline(never); - inline + u128 ); /// Divides `duo` by `div` and returns a tuple of the quotient and the remainder. @@ -226,7 +208,6 @@ fn u32_by_u32_div_rem(duo: u32, div: u32) -> (u32, u32) { ))] impl_delegate!( u64_div_rem, - i64_div_rem, zero_div_fn, u32_normalization_shift, u32_by_u32_div_rem, @@ -234,9 +215,7 @@ impl_delegate!( u16, u32, u64, - i64, - inline(never); - inline + i64 ); // When not on x86 and the pointer width is 64, use `binary_long`. 
@@ -246,14 +225,11 @@ impl_delegate!( ))] impl_binary_long!( u64_div_rem, - i64_div_rem, zero_div_fn, u64_normalization_shift, 64, u64, - i64, - inline(never); - inline + i64 ); /// Divides `duo` by `div` and returns a tuple of the quotient and the remainder. @@ -287,28 +263,21 @@ unsafe fn u64_by_u32_div_rem(duo: u64, div: u32) -> (u32, u32) { #[cfg(all(feature = "asm", target_arch = "x86"))] impl_asymmetric!( u64_div_rem, - i64_div_rem, zero_div_fn, u32_by_u32_div_rem, u64_by_u32_div_rem, 16, u16, u32, - u64, - i64, - inline(never); - inline + u64 ); // 32 bits is the smallest division used by `compiler-builtins`, so we end with binary long division impl_binary_long!( u32_div_rem, - i32_div_rem, zero_div_fn, u32_normalization_shift, 32, u32, - i32, - inline(never); - inline + i32 ); diff --git a/src/int/specialized_div_rem/norm_shift.rs b/src/int/specialized_div_rem/norm_shift.rs index 33348b373..be95d1b92 100644 --- a/src/int/specialized_div_rem/norm_shift.rs +++ b/src/int/specialized_div_rem/norm_shift.rs @@ -1,4 +1,5 @@ /// Creates a function used by some division algorithms to compute the "normalization shift". +#[doc(hidden)] #[macro_export] macro_rules! impl_normalization_shift { ( diff --git a/src/int/specialized_div_rem/trifecta.rs b/src/int/specialized_div_rem/trifecta.rs index e76516f34..a9ea60301 100644 --- a/src/int/specialized_div_rem/trifecta.rs +++ b/src/int/specialized_div_rem/trifecta.rs @@ -1,28 +1,22 @@ -/// Creates unsigned and signed division functions optimized for division of integers with bitwidths +/// Creates an unsigned division function optimized for division of integers with bitwidths /// larger than the largest hardware integer division supported. These functions use large radix /// division algorithms that require both fast division and very fast widening multiplication on the /// target microarchitecture. Otherwise, `impl_delegate` should be used instead. +#[doc(hidden)] #[macro_export] macro_rules! 
impl_trifecta { ( - $unsigned_name:ident, // name of the unsigned division function - $signed_name:ident, // name of the signed division function + $fn:ident, // name of the unsigned division function $zero_div_fn:ident, // function called when division by zero is attempted $half_division:ident, // function for division of a $uX by a $uX $n_h:expr, // the number of bits in $iH or $uH $uH:ident, // unsigned integer with half the bit width of $uX $uX:ident, // unsigned integer with half the bit width of $uD - $uD:ident, // unsigned integer type for the inputs and outputs of `$unsigned_name` - $iD:ident, // signed integer type for the inputs and outputs of `$signed_name` - $($unsigned_attr:meta),*; // attributes for the unsigned function - $($signed_attr:meta),* // attributes for the signed function + $uD:ident // unsigned integer type for the inputs and outputs of `$fn` ) => { /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a /// tuple. - $( - #[$unsigned_attr] - )* - pub fn $unsigned_name(duo: $uD, div: $uD) -> ($uD, $uD) { + pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) { // This is called the trifecta algorithm because it uses three main algorithms: short // division for small divisors, the two possibility algorithm for large divisors, and an // undersubtracting long division algorithm for intermediate cases. @@ -34,7 +28,9 @@ macro_rules! impl_trifecta { (tmp as $uX, (tmp >> ($n_h * 2)) as $uX) } fn carrying_mul_add(lhs: $uX, mul: $uX, add: $uX) -> ($uX, $uX) { - let tmp = (lhs as $uD).wrapping_mul(mul as $uD).wrapping_add(add as $uD); + let tmp = (lhs as $uD) + .wrapping_mul(mul as $uD) + .wrapping_add(add as $uD); (tmp as $uX, (tmp >> ($n_h * 2)) as $uX) } @@ -62,9 +58,9 @@ macro_rules! impl_trifecta { // The quotient cannot be more than 1. The highest set bit of `duo` needs to be at // least one place higher than `div` for the quotient to be more than 1. 
if duo >= div { - return (1, duo - div) + return (1, duo - div); } else { - return (0, duo) + return (0, duo); } } @@ -76,10 +72,7 @@ macro_rules! impl_trifecta { // `duo < 2^n` so it will fit in a $uX. `div` will also fit in a $uX (because of the // `div_lz <= duo_lz` branch) so no numerical error. let (quo, rem) = $half_division(duo as $uX, div as $uX); - return ( - quo as $uD, - rem as $uD - ) + return (quo as $uD, rem as $uD); } // `{2^n, 2^div_sb} <= duo < 2^n_d` @@ -99,22 +92,16 @@ macro_rules! impl_trifecta { let div_0 = div as $uH as $uX; let (quo_hi, rem_3) = $half_division(duo_hi, div_0); - let duo_mid = - ((duo >> $n_h) as $uH as $uX) - | (rem_3 << $n_h); + let duo_mid = ((duo >> $n_h) as $uH as $uX) | (rem_3 << $n_h); let (quo_1, rem_2) = $half_division(duo_mid, div_0); - let duo_lo = - (duo as $uH as $uX) - | (rem_2 << $n_h); + let duo_lo = (duo as $uH as $uX) | (rem_2 << $n_h); let (quo_0, rem_1) = $half_division(duo_lo, div_0); return ( - (quo_0 as $uD) - | ((quo_1 as $uD) << $n_h) - | ((quo_hi as $uD) << n), - rem_1 as $uD - ) + (quo_0 as $uD) | ((quo_1 as $uD) << $n_h) | ((quo_hi as $uD) << n), + rem_1 as $uD, + ); } // relative leading significant bits, cannot overflow because of above branches @@ -237,13 +224,10 @@ macro_rules! impl_trifecta { (quo - 1) as $uD, // Both the addition and subtraction can overflow, but when combined end up // as a correct positive number. - duo.wrapping_add(div).wrapping_sub(tmp) - ) + duo.wrapping_add(div).wrapping_sub(tmp), + ); } else { - return ( - quo as $uD, - duo - tmp - ) + return (quo as $uD, duo - tmp); } } @@ -372,13 +356,10 @@ macro_rules! impl_trifecta { if duo < tmp { return ( quo + ((quo_part - 1) as $uD), - duo.wrapping_add(div).wrapping_sub(tmp) - ) + duo.wrapping_add(div).wrapping_sub(tmp), + ); } else { - return ( - quo + (quo_part as $uD), - duo - tmp - ) + return (quo + (quo_part as $uD), duo - tmp); } } @@ -387,15 +368,9 @@ macro_rules! 
impl_trifecta { if div_lz <= duo_lz { // quotient can have 0 or 1 added to it if div <= duo { - return ( - quo + 1, - duo - div - ) + return (quo + 1, duo - div); } else { - return ( - quo, - duo - ) + return (quo, duo); } } @@ -404,38 +379,9 @@ macro_rules! impl_trifecta { if n <= duo_lz { // simple division and addition let tmp = $half_division(duo as $uX, div as $uX); - return ( - quo + (tmp.0 as $uD), - tmp.1 as $uD - ) + return (quo + (tmp.0 as $uD), tmp.1 as $uD); } } } - - /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a - /// tuple. - $( - #[$signed_attr] - )* - pub fn $signed_name(duo: $iD, div: $iD) -> ($iD, $iD) { - match (duo < 0, div < 0) { - (false, false) => { - let t = $unsigned_name(duo as $uD, div as $uD); - (t.0 as $iD, t.1 as $iD) - }, - (true, false) => { - let t = $unsigned_name(duo.wrapping_neg() as $uD, div as $uD); - ((t.0 as $iD).wrapping_neg(), (t.1 as $iD).wrapping_neg()) - }, - (false, true) => { - let t = $unsigned_name(duo as $uD, div.wrapping_neg() as $uD); - ((t.0 as $iD).wrapping_neg(), t.1 as $iD) - }, - (true, true) => { - let t = $unsigned_name(duo.wrapping_neg() as $uD, div.wrapping_neg() as $uD); - (t.0 as $iD, (t.1 as $iD).wrapping_neg()) - }, - } - } - } + }; }