-
Notifications
You must be signed in to change notification settings - Fork 43
Rework quantile iteration logic #67
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
2da9f0a
2804a9f
3b1b3ba
0c5e264
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -142,7 +142,9 @@ impl<'a, T: 'a, P> Iterator for HistogramIterator<'a, T, P> | |
return None; | ||
} | ||
|
||
// have we yielded all non-zeros in the histogram? | ||
// TODO should check if we've reached max, not count, to avoid early termination | ||
// on histograms with very large counts whose total would exceed u64::max_value() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, that's a good point. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Though doesn't There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It absolutely does, but we could limit the damage to only saturating |
||
// Have we yielded all non-zeros in the histogram? | ||
let total = self.hist.count(); | ||
if self.prev_total_count == total { | ||
// is the picker done? | ||
|
@@ -163,7 +165,7 @@ impl<'a, T: 'a, P> Iterator for HistogramIterator<'a, T, P> | |
// if we've seen all counts, no other counts should be non-zero | ||
if self.total_count_to_index == total { | ||
// TODO this can fail when total count overflows | ||
assert!(count == T::zero()); | ||
assert_eq!(count, T::zero()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Got tired of IntelliJ warning me that this could be |
||
} | ||
|
||
// TODO overflow | ||
|
@@ -182,6 +184,7 @@ impl<'a, T: 'a, P> Iterator for HistogramIterator<'a, T, P> | |
// exposed to the same value again after yielding. not sure why this is the | ||
// behavior we want, but it's what the original Java implementation dictates. | ||
|
||
// TODO count starting at 0 each time we emit a value to be less prone to overflow | ||
self.prev_total_count = self.total_count_to_index; | ||
return Some(val); | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,8 +7,7 @@ pub struct Iter<'a, T: 'a + Counter> { | |
hist: &'a Histogram<T>, | ||
|
||
ticks_per_half_distance: u32, | ||
quantile_to_iterate_to: f64, | ||
reached_last_recorded_value: bool, | ||
quantile_to_iterate_to: f64 | ||
} | ||
|
||
impl<'a, T: 'a + Counter> Iter<'a, T> { | ||
|
@@ -21,8 +20,7 @@ impl<'a, T: 'a + Counter> Iter<'a, T> { | |
Iter { | ||
hist, | ||
ticks_per_half_distance, | ||
quantile_to_iterate_to: 0.0, | ||
reached_last_recorded_value: false, | ||
quantile_to_iterate_to: 0.0 | ||
}) | ||
} | ||
} | ||
|
@@ -42,6 +40,13 @@ impl<'a, T: 'a + Counter> PickyIterator<T> for Iter<'a, T> { | |
return false; | ||
} | ||
|
||
if self.quantile_to_iterate_to == 1.0 { | ||
// We incremented to 1.0 just at the point where we finally got to the last non-zero | ||
// bucket. We want to pick this value but not do the math below because it doesn't work | ||
// when quantile >= 1.0. | ||
return true; | ||
} | ||
|
||
// The choice to maintain fixed-sized "ticks" in each half-distance to 100% [starting from | ||
// 0%], as opposed to a "tick" size that varies with each interval, was made to make the | ||
// steps easily comprehensible and readable to humans. The resulting quantile steps are | ||
|
@@ -61,29 +66,27 @@ impl<'a, T: 'a + Counter> PickyIterator<T> for Iter<'a, T> { | |
// slice, traverse half to get to 50%. Then traverse half of the last (second) slice to get | ||
// to 75%, etc. | ||
// Minimum of 0 (1.0/1.0 = 1, log 2 of which is 0) so unsigned cast is safe. | ||
// Won't hit the `inf` case because quantile < 1.0, so this should yield an actual number. | ||
let num_halvings = (1.0 / (1.0 - self.quantile_to_iterate_to)).log2() as u32; | ||
// Calculate the total number of ticks in 0-1 given that half of each slice is tick'd. | ||
// The number of slices is 2 ^ num_halvings, and each slice has two "half distances" to | ||
// tick, so we add an extra power of two to get ticks per whole distance. | ||
// Use u64 math so that there's less risk of overflow with large numbers of ticks and data | ||
// that ends up needing large numbers of halvings. | ||
// TODO calculate the worst case total_ticks and make sure we can't ever overflow here | ||
let total_ticks = (self.ticks_per_half_distance as u64) | ||
.checked_mul(1_u64.checked_shl(num_halvings + 1).expect("too many halvings")) | ||
.expect("too many total ticks"); | ||
let increment_size = 1.0 / total_ticks as f64; | ||
self.quantile_to_iterate_to += increment_size; | ||
// Unclear if it's possible for adding a very small increment to 0.999999... to yield > 1.0 | ||
// but let's just be safe since FP is weird and this code is not likely to be very hot. | ||
self.quantile_to_iterate_to = 1.0_f64.min(self.quantile_to_iterate_to + increment_size); | ||
true | ||
} | ||
|
||
fn more(&mut self, _: usize) -> bool { | ||
// We want one additional last step to 100% | ||
if !self.reached_last_recorded_value && self.hist.count() != 0 { | ||
self.quantile_to_iterate_to = 1.0; | ||
self.reached_last_recorded_value = true; | ||
true | ||
} else { | ||
false | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the Java impl, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm... maybe the Java way is better and we should expose that as well? Java uses There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think having another value in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK, I'll go ahead and add it. It will make the quantile iteration output a little easier to visually comprehend. |
||
// No need to go past the point where cumulative count == total count, because that's | ||
// quantile 1.0 and will be reported as such in the IterationValue, even if | ||
// `quantile_to_iterate_to` is somewhere below 1.0 -- we still got to the final bucket. | ||
false | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -200,6 +200,7 @@ extern crate num_traits as num; | |
|
||
use std::borrow::Borrow; | ||
use std::cmp; | ||
use std::fmt; | ||
use std::ops::{AddAssign, SubAssign}; | ||
use num::ToPrimitive; | ||
|
||
|
@@ -218,7 +219,7 @@ const ORIGINAL_MAX: u64 = 0; | |
/// Partial ordering is used for threshholding, also usually in the context of quantiles. | ||
pub trait Counter | ||
: num::Num + num::ToPrimitive + num::FromPrimitive + num::Saturating + num::CheckedSub | ||
+ num::CheckedAdd + Copy + PartialOrd<Self> { | ||
+ num::CheckedAdd + Copy + PartialOrd<Self> + fmt::Debug { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is to allow the use of |
||
|
||
/// Counter as a f64. | ||
fn as_f64(&self) -> f64; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Any particular reason you don't just want
let stdin = std::io::stdin().lock();
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The compiler gets grumpy about the lifetime of
stdin()
if I remove the intermediatelet
. Perhaps there is a better way though?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah.. This is at least a little nicer: