Skip to content

Commit ed3f782

Browse files
Bulk quantiles (#26)
* Promoted module to directory * Moved interpolate to separate file * Re-implemented quantile_axis_mut to get closer to something we can use for bulk computation * Use a set instead of a vec to avoid repeating computations * Use bulk method for single quantile * Implement bulk method to get sorted * Refactored quantiles_axis_mut to use sorted_get_many_mut * Avoid recomputing index value * Add quantiles_mut to 1d trait * Return hashmaps from bulk methods * Fixed tests * Use IndexSet to preserve insertion order * Fix indentation * IndexMap provides a more intuitive behaviour * Remove prints * Renamed methods * Docs for get_many_from_sorted_mut * Added docs for private free function * Docs for quantiles_mut * Fixed several typos in docs * More robust test * Added test for quantiles * Test quantiles_axis_mut * Add comments * Return options when the lane we are computing against is empty * Fixed docs * Fixed tests * Move *index* functions out of Interpolate trait The behavior of these functions should be independent of the interpolation strategy. * Reduce indentation in quantiles_axis_mut * Reduce indentation in quantile_axis_skipnan_mut * Use .into_scalar() method * Improve docs of partition_mut * Reformat quantiles_axis_mut * Cargo fmt * Fmt * Formatting * Log version works * Refactor * Fix indexes * Working implementation * Shorter syntax * Formatting * Better docs * Comments * Typo * Don't lose pieces after rebase * Fmt * Reduce code duplication * Fmt * Clarify docs of get_many_from_sorted_mut_unchecked * Add get_many_from_sorted_mut benchmark * Simplify get_many_from_sorted_mut_unchecked * Eliminate allocations from _get_many_from_sorted_mut_unchecked This improves performance, especially for small arrays. * Add get_from_sorted_mut benchmark * Call slice_axis_mut instead of slice_mut This has slightly lower overhead. * Replace iter::repeat with vec! This is a bit cleaner. * Fix typo in comment * Remove unnecessary type annotation * Simplify quantiles tests For me, the tests are easier to understand when they don't collect into a `Vec`. * Check keys in test_sorted_get_many_mut * Simplify sort tests * Improve sort and quantiles docs * Make Interpolate::interpolate operate elementwise * Make quantiles_* return Array instead of IndexMap * Add interpolate parameter to quantile* This has a few advantages: * It's now possible to save the interpolation strategy in a variable and easily re-use it. * We can now freely add more type parameters to the `quantile*` methods as needed without making them more difficult to call. * We now have the flexibility to add more advanced interpolation strategies in the future (e.g. one that wraps a closure). * Calling the `quantile*` methods is now slightly more compact because a turbofish isn't necessary. * Make get_many_from_sorted_mut take array of indexes This is slightly more versatile because `ArrayBase` allows arbitrary strides. * Make quantiles* take array instead of slice * Remove unnecessary IndexSet * Return EmptyInput instead of None * Fix tests * Match output type for argmin/max_skipnan * Fix tests * Fmt * Update src/lib.rs Co-Authored-By: LukeMathWalker <[email protected]> * Add quantile error * Renamed InvalidFraction to InvalidQuantile * Return QuantileError * Fix tests * Fix docs * Fmt * Simplify and deduplicate
1 parent 9e04bc0 commit ed3f782

File tree

10 files changed

+918
-281
lines changed

10 files changed

+918
-281
lines changed

Cargo.toml

+8-1
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,15 @@ num-integer = "0.1"
2121
num-traits = "0.2"
2222
rand = "0.6"
2323
itertools = { version = "0.7.0", default-features = false }
24+
indexmap = "1.0"
2425

2526
[dev-dependencies]
26-
quickcheck = "0.7"
27+
criterion = "0.2"
28+
quickcheck = { version = "0.8.1", default-features = false }
2729
ndarray-rand = "0.9"
2830
approx = "0.3"
31+
quickcheck_macros = "0.8"
32+
33+
[[bench]]
34+
name = "sort"
35+
harness = false

benches/sort.rs

+67
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
extern crate criterion;
2+
extern crate ndarray;
3+
extern crate ndarray_stats;
4+
extern crate rand;
5+
6+
use criterion::{
7+
black_box, criterion_group, criterion_main, AxisScale, BatchSize, Criterion,
8+
ParameterizedBenchmark, PlotConfiguration,
9+
};
10+
use ndarray::prelude::*;
11+
use ndarray_stats::Sort1dExt;
12+
use rand::prelude::*;
13+
14+
fn get_from_sorted_mut(c: &mut Criterion) {
15+
let lens = vec![10, 100, 1000, 10000];
16+
let benchmark = ParameterizedBenchmark::new(
17+
"get_from_sorted_mut",
18+
|bencher, &len| {
19+
let mut rng = StdRng::seed_from_u64(42);
20+
let mut data: Vec<_> = (0..len).collect();
21+
data.shuffle(&mut rng);
22+
let indices: Vec<_> = (0..len).step_by(len / 10).collect();
23+
bencher.iter_batched(
24+
|| Array1::from(data.clone()),
25+
|mut arr| {
26+
for &i in &indices {
27+
black_box(arr.get_from_sorted_mut(i));
28+
}
29+
},
30+
BatchSize::SmallInput,
31+
)
32+
},
33+
lens,
34+
)
35+
.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic));
36+
c.bench("get_from_sorted_mut", benchmark);
37+
}
38+
39+
fn get_many_from_sorted_mut(c: &mut Criterion) {
40+
let lens = vec![10, 100, 1000, 10000];
41+
let benchmark = ParameterizedBenchmark::new(
42+
"get_many_from_sorted_mut",
43+
|bencher, &len| {
44+
let mut rng = StdRng::seed_from_u64(42);
45+
let mut data: Vec<_> = (0..len).collect();
46+
data.shuffle(&mut rng);
47+
let indices: Vec<_> = (0..len).step_by(len / 10).collect();
48+
bencher.iter_batched(
49+
|| Array1::from(data.clone()),
50+
|mut arr| {
51+
black_box(arr.get_many_from_sorted_mut(&indices));
52+
},
53+
BatchSize::SmallInput,
54+
)
55+
},
56+
lens,
57+
)
58+
.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic));
59+
c.bench("get_many_from_sorted_mut", benchmark);
60+
}
61+
62+
criterion_group! {
63+
name = benches;
64+
config = Criterion::default();
65+
targets = get_from_sorted_mut, get_many_from_sorted_mut
66+
}
67+
criterion_main!(benches);

src/errors.rs

+29
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
//! Custom errors returned from our methods and functions.
2+
use noisy_float::types::N64;
23
use std::error::Error;
34
use std::fmt;
45

@@ -112,3 +113,31 @@ impl From<ShapeMismatch> for MultiInputError {
112113
MultiInputError::ShapeMismatch(err)
113114
}
114115
}
116+
117+
/// An error computing a quantile.
118+
#[derive(Debug, Clone, Eq, PartialEq)]
119+
pub enum QuantileError {
120+
/// The input was empty.
121+
EmptyInput,
122+
/// The `q` was not between `0.` and `1.` (inclusive).
123+
InvalidQuantile(N64),
124+
}
125+
126+
impl fmt::Display for QuantileError {
127+
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
128+
match self {
129+
QuantileError::EmptyInput => write!(f, "Empty input."),
130+
QuantileError::InvalidQuantile(q) => {
131+
write!(f, "{:} is not between 0. and 1. (inclusive).", q)
132+
}
133+
}
134+
}
135+
}
136+
137+
impl Error for QuantileError {}
138+
139+
impl From<EmptyInput> for QuantileError {
140+
fn from(_: EmptyInput) -> QuantileError {
141+
QuantileError::EmptyInput
142+
}
143+
}

src/histogram/strategies.rs

+3-2
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ use super::errors::BinsBuildError;
2424
use super::{Bins, Edges};
2525
use ndarray::prelude::*;
2626
use ndarray::Data;
27+
use noisy_float::types::n64;
2728
use num_traits::{FromPrimitive, NumOps, Zero};
2829

2930
/// A trait implemented by all strategies to build [`Bins`]
@@ -334,8 +335,8 @@ where
334335
}
335336

336337
let mut a_copy = a.to_owned();
337-
let first_quartile = a_copy.quantile_mut::<Nearest>(0.25).unwrap();
338-
let third_quartile = a_copy.quantile_mut::<Nearest>(0.75).unwrap();
338+
let first_quartile = a_copy.quantile_mut(n64(0.25), &Nearest).unwrap();
339+
let third_quartile = a_copy.quantile_mut(n64(0.75), &Nearest).unwrap();
339340
let iqr = third_quartile - first_quartile;
340341

341342
let bin_width = FreedmanDiaconis::compute_bin_width(n_points, iqr);

src/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
//! [`NumPy`]: https://docs.scipy.org/doc/numpy-1.14.1/reference/routines.statistics.html
2626
//! [`StatsBase.jl`]: https://juliastats.github.io/StatsBase.jl/latest/
2727
28+
extern crate indexmap;
2829
extern crate itertools;
2930
extern crate ndarray;
3031
extern crate noisy_float;

src/quantile/interpolate.rs

+138
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
//! Interpolation strategies.
2+
use noisy_float::types::N64;
3+
use num_traits::{Float, FromPrimitive, NumOps, ToPrimitive};
4+
5+
fn float_quantile_index(q: N64, len: usize) -> N64 {
6+
q * ((len - 1) as f64)
7+
}
8+
9+
/// Returns the fraction that the quantile is between the lower and higher indices.
10+
///
11+
/// This ranges from 0, where the quantile exactly corresponds the lower index,
12+
/// to 1, where the quantile exactly corresponds to the higher index.
13+
fn float_quantile_index_fraction(q: N64, len: usize) -> N64 {
14+
float_quantile_index(q, len).fract()
15+
}
16+
17+
/// Returns the index of the value on the lower side of the quantile.
18+
pub(crate) fn lower_index(q: N64, len: usize) -> usize {
19+
float_quantile_index(q, len).floor().to_usize().unwrap()
20+
}
21+
22+
/// Returns the index of the value on the higher side of the quantile.
23+
pub(crate) fn higher_index(q: N64, len: usize) -> usize {
24+
float_quantile_index(q, len).ceil().to_usize().unwrap()
25+
}
26+
27+
/// Used to provide an interpolation strategy to [`quantile_axis_mut`].
28+
///
29+
/// [`quantile_axis_mut`]: ../trait.QuantileExt.html#tymethod.quantile_axis_mut
30+
pub trait Interpolate<T> {
31+
/// Returns `true` iff the lower value is needed to compute the
32+
/// interpolated value.
33+
#[doc(hidden)]
34+
fn needs_lower(q: N64, len: usize) -> bool;
35+
36+
/// Returns `true` iff the higher value is needed to compute the
37+
/// interpolated value.
38+
#[doc(hidden)]
39+
fn needs_higher(q: N64, len: usize) -> bool;
40+
41+
/// Computes the interpolated value.
42+
///
43+
/// **Panics** if `None` is provided for the lower value when it's needed
44+
/// or if `None` is provided for the higher value when it's needed.
45+
#[doc(hidden)]
46+
fn interpolate(lower: Option<T>, higher: Option<T>, q: N64, len: usize) -> T;
47+
}
48+
49+
/// Select the higher value.
50+
pub struct Higher;
51+
/// Select the lower value.
52+
pub struct Lower;
53+
/// Select the nearest value.
54+
pub struct Nearest;
55+
/// Select the midpoint of the two values (`(lower + higher) / 2`).
56+
pub struct Midpoint;
57+
/// Linearly interpolate between the two values
58+
/// (`lower + (higher - lower) * fraction`, where `fraction` is the
59+
/// fractional part of the index surrounded by `lower` and `higher`).
60+
pub struct Linear;
61+
62+
impl<T> Interpolate<T> for Higher {
63+
fn needs_lower(_q: N64, _len: usize) -> bool {
64+
false
65+
}
66+
fn needs_higher(_q: N64, _len: usize) -> bool {
67+
true
68+
}
69+
fn interpolate(_lower: Option<T>, higher: Option<T>, _q: N64, _len: usize) -> T {
70+
higher.unwrap()
71+
}
72+
}
73+
74+
impl<T> Interpolate<T> for Lower {
75+
fn needs_lower(_q: N64, _len: usize) -> bool {
76+
true
77+
}
78+
fn needs_higher(_q: N64, _len: usize) -> bool {
79+
false
80+
}
81+
fn interpolate(lower: Option<T>, _higher: Option<T>, _q: N64, _len: usize) -> T {
82+
lower.unwrap()
83+
}
84+
}
85+
86+
impl<T> Interpolate<T> for Nearest {
87+
fn needs_lower(q: N64, len: usize) -> bool {
88+
float_quantile_index_fraction(q, len) < 0.5
89+
}
90+
fn needs_higher(q: N64, len: usize) -> bool {
91+
!<Self as Interpolate<T>>::needs_lower(q, len)
92+
}
93+
fn interpolate(lower: Option<T>, higher: Option<T>, q: N64, len: usize) -> T {
94+
if <Self as Interpolate<T>>::needs_lower(q, len) {
95+
lower.unwrap()
96+
} else {
97+
higher.unwrap()
98+
}
99+
}
100+
}
101+
102+
impl<T> Interpolate<T> for Midpoint
103+
where
104+
T: NumOps + Clone + FromPrimitive,
105+
{
106+
fn needs_lower(_q: N64, _len: usize) -> bool {
107+
true
108+
}
109+
fn needs_higher(_q: N64, _len: usize) -> bool {
110+
true
111+
}
112+
fn interpolate(lower: Option<T>, higher: Option<T>, _q: N64, _len: usize) -> T {
113+
let denom = T::from_u8(2).unwrap();
114+
let lower = lower.unwrap();
115+
let higher = higher.unwrap();
116+
lower.clone() + (higher.clone() - lower.clone()) / denom.clone()
117+
}
118+
}
119+
120+
impl<T> Interpolate<T> for Linear
121+
where
122+
T: NumOps + Clone + FromPrimitive + ToPrimitive,
123+
{
124+
fn needs_lower(_q: N64, _len: usize) -> bool {
125+
true
126+
}
127+
fn needs_higher(_q: N64, _len: usize) -> bool {
128+
true
129+
}
130+
fn interpolate(lower: Option<T>, higher: Option<T>, q: N64, len: usize) -> T {
131+
let fraction = float_quantile_index_fraction(q, len).to_f64().unwrap();
132+
let lower = lower.unwrap();
133+
let higher = higher.unwrap();
134+
let lower_f64 = lower.to_f64().unwrap();
135+
let higher_f64 = higher.to_f64().unwrap();
136+
lower.clone() + T::from_f64(fraction * (higher_f64 - lower_f64)).unwrap()
137+
}
138+
}

0 commit comments

Comments
 (0)