Skip to content

Commit ba4eb03

Browse files
author
Marshall Pierce
authored
Merge pull request #67 from jonhoo/quantile-iter-end
Rework quantile iteration logic
2 parents f14bd3c + 0c5e264 commit ba4eb03

File tree

8 files changed

+249
-79
lines changed

8 files changed

+249
-79
lines changed

examples/cli.rs

Lines changed: 145 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,20 @@
33
extern crate hdrsample;
44
extern crate clap;
55

6-
use std::io::BufRead;
6+
use std::io;
7+
use std::io::{Write, BufRead};
8+
use std::fmt::Display;
79

810
use clap::{App, Arg, SubCommand};
911

10-
use hdrsample::Histogram;
11-
use hdrsample::serialization::{V2Serializer, V2DeflateSerializer};
12+
use hdrsample::{Histogram, RecordError};
13+
use hdrsample::serialization::{V2Serializer, V2SerializeError, V2DeflateSerializer, V2DeflateSerializeError, Deserializer, DeserializeError};
1214

1315
fn main() {
1416
let default_max = format!("{}", u64::max_value());
1517
let matches = App::new("hdrsample cli")
1618
.subcommand(SubCommand::with_name("serialize")
19+
.about("Transform number-per-line input from stdin into a serialized histogram on stdout")
1720
.arg(Arg::with_name("min")
1821
.long("min")
1922
.help("Minimum discernible value")
@@ -37,8 +40,26 @@ fn main() {
3740
.short("r")
3841
.long("resize")
3942
.help("Enable auto resize")))
43+
.subcommand(SubCommand::with_name("iter-quantiles")
44+
.about("Display quantiles to stdout from serialized histogram stdin")
45+
.arg(Arg::with_name("ticks")
46+
.short("t")
47+
.long("ticks-per-half")
48+
.takes_value(true)
49+
.required(true)
50+
.help("Ticks per half distance"))
51+
.arg(Arg::with_name("quantile-precision")
52+
.long("quantile-precision")
53+
.takes_value(true)
54+
.default_value("20")))
4055
.get_matches();
4156

57+
let stdin = std::io::stdin();
58+
let stdin = stdin.lock();
59+
60+
let stdout = std::io::stdout();
61+
let stdout = stdout.lock();
62+
4263
match matches.subcommand_name() {
4364
Some("serialize") => {
4465
let sub_matches = matches.subcommand_matches("serialize").unwrap();
@@ -52,28 +73,135 @@ fn main() {
5273
h.auto(true);
5374
}
5475

55-
serialize(h, sub_matches.is_present("compression"));
56-
},
76+
serialize(stdin, stdout, h, sub_matches.is_present("compression"))
77+
}
78+
Some("iter-quantiles") => {
79+
let sub_matches = matches.subcommand_matches("iter-quantiles").unwrap();
80+
let ticks_per_half = sub_matches.value_of("ticks").unwrap().parse().unwrap();
81+
let quantile_precision = sub_matches.value_of("quantile-precision").unwrap().parse().unwrap();
82+
quantiles(stdin, stdout, quantile_precision, ticks_per_half)
83+
}
5784
_ => unreachable!()
58-
}
85+
}.expect("Subcommand failed")
5986
}
6087

61-
fn serialize(mut h: Histogram<u64>, compression: bool) {
62-
let stdin = std::io::stdin();
63-
let stdin_handle = stdin.lock();
64-
65-
for num in stdin_handle.lines()
88+
/// Read numbers, one per line, from `reader` and write the resulting serialized histogram to `writer`.
89+
fn serialize<R: BufRead, W: Write>(reader: R, mut writer: W, mut h: Histogram<u64>, compression: bool) -> Result<(), CliError> {
90+
for num in reader.lines()
6691
.map(|l| l.expect("Should be able to read stdin"))
6792
.map(|s| s.parse().expect("Each line must be a u64")) {
68-
h.record(num).unwrap();
93+
h.record(num)?;
6994
}
7095

71-
let stdout = std::io::stdout();
72-
let mut stdout_handle = stdout.lock();
73-
7496
if compression {
75-
V2DeflateSerializer::new().serialize(&h, &mut stdout_handle).unwrap();
97+
V2DeflateSerializer::new().serialize(&h, &mut writer)?;
7698
} else {
77-
V2Serializer::new().serialize(&h, &mut stdout_handle).unwrap();
99+
V2Serializer::new().serialize(&h, &mut writer)?;
100+
}
101+
102+
Ok(())
103+
}
104+
105+
/// Output histogram data in a format similar to the Java impl's
106+
/// `AbstractHistogram#outputPercentileDistribution`.
107+
fn quantiles<R: BufRead, W: Write>(mut reader: R, mut writer: W, quantile_precision: usize, ticks_per_half: u32) -> Result<(), CliError> {
108+
let hist: Histogram<u64> = Deserializer::new().deserialize(&mut reader)?;
109+
110+
writer.write_all(
111+
format!(
112+
"{:>12} {:>quantile_precision$} {:>quantile_precision$} {:>10} {:>14}\n\n",
113+
"Value",
114+
"QuantileValue",
115+
"QuantileIteration",
116+
"TotalCount",
117+
"1/(1-Quantile)",
118+
quantile_precision = quantile_precision + 2 // + 2 from leading "0." for numbers
119+
).as_ref(),
120+
)?;
121+
let mut sum = 0;
122+
for v in hist.iter_quantiles(ticks_per_half) {
123+
sum += v.count_since_last_iteration();
124+
if v.quantile() < 1.0 {
125+
writer.write_all(
126+
format!(
127+
"{:12} {:1.*} {:1.*} {:10} {:14.2}\n",
128+
v.value(),
129+
quantile_precision,
130+
v.quantile(),
131+
quantile_precision,
132+
v.quantile_iterated_to(),
133+
sum,
134+
1_f64 / (1_f64 - v.quantile())
135+
).as_ref(),
136+
)?;
137+
} else {
138+
writer.write_all(
139+
format!(
140+
"{:12} {:1.*} {:1.*} {:10} {:>14}\n",
141+
v.value(),
142+
quantile_precision,
143+
v.quantile(),
144+
quantile_precision,
145+
v.quantile_iterated_to(),
146+
sum,
147+
"∞"
148+
).as_ref(),
149+
)?;
150+
}
151+
}
152+
153+
fn write_extra_data<T1: Display, T2: Display, W: Write>(
154+
writer: &mut W, label1: &str, data1: T1, label2: &str, data2: T2) -> Result<(), io::Error> {
155+
writer.write_all(format!("#[{:10} = {:12.2}, {:14} = {:12.2}]\n",
156+
label1, data1, label2, data2).as_ref())
157+
}
158+
159+
write_extra_data(&mut writer, "Mean", hist.mean(), "StdDeviation", hist.stdev())?;
160+
write_extra_data(&mut writer, "Max", hist.max(), "Total count", hist.count())?;
161+
write_extra_data(&mut writer, "Buckets", hist.buckets(), "SubBuckets", hist.len())?;
162+
163+
Ok(())
164+
}
165+
166+
167+
// A handy way to enable use of the `?` operator in subcommands by mapping common errors.
168+
// Normally I frown on excessive use of From as it's too "magic", but in the limited confines of
169+
// subcommands, the convenience seems worth it.
170+
#[derive(Debug)]
171+
enum CliError {
172+
IoError(io::Error),
173+
HistogramSerializeError(V2SerializeError),
174+
HistogramSerializeCompressedError(V2DeflateSerializeError),
175+
HistogramDeserializeError(DeserializeError),
176+
HistogramRecordError(RecordError)
177+
}
178+
179+
impl From<io::Error> for CliError {
180+
fn from(e: io::Error) -> Self {
181+
CliError::IoError(e)
182+
}
183+
}
184+
185+
impl From<V2SerializeError> for CliError {
186+
fn from(e: V2SerializeError) -> Self {
187+
CliError::HistogramSerializeError(e)
188+
}
189+
}
190+
191+
impl From<V2DeflateSerializeError> for CliError {
192+
fn from(e: V2DeflateSerializeError) -> Self {
193+
CliError::HistogramSerializeCompressedError(e)
194+
}
195+
}
196+
197+
impl From<RecordError> for CliError {
198+
fn from(e: RecordError) -> Self {
199+
CliError::HistogramRecordError(e)
200+
}
201+
}
202+
203+
impl From<DeserializeError> for CliError {
204+
fn from(e: DeserializeError) -> Self {
205+
CliError::HistogramDeserializeError(e)
78206
}
79207
}

src/iterators/all.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,4 +26,8 @@ impl<T: Counter> PickyIterator<T> for Iter {
2626
fn more(&mut self, _: usize) -> bool {
2727
true
2828
}
29+
30+
fn quantile_iterated_to(&self) -> Option<f64> {
31+
None
32+
}
2933
}

src/iterators/linear.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ impl<'a, T: 'a + Counter> Iter<'a, T> {
2020
assert!(value_units_per_bucket > 0, "value_units_per_bucket must be > 0");
2121
HistogramIterator::new(hist,
2222
Iter {
23-
hist: hist,
24-
value_units_per_bucket: value_units_per_bucket,
23+
hist,
24+
value_units_per_bucket,
2525
// won't underflow because value_units_per_bucket > 0
2626
current_step_highest_value_reporting_level: value_units_per_bucket - 1,
2727
current_step_lowest_value_reporting_level:
@@ -51,4 +51,8 @@ impl<'a, T: 'a + Counter> PickyIterator<T> for Iter<'a, T> {
5151
// TODO index + 1 could overflow 16-bit usize
5252
self.current_step_highest_value_reporting_level + 1 < self.hist.value_for(index + 1)
5353
}
54+
55+
fn quantile_iterated_to(&self) -> Option<f64> {
56+
None
57+
}
5458
}

src/iterators/log.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@ impl<'a, T: 'a + Counter> Iter<'a, T> {
2525
assert!(log_base > 1.0, "log_base must be > 1.0");
2626
HistogramIterator::new(hist,
2727
Iter {
28-
hist: hist,
29-
log_base: log_base,
28+
hist,
29+
log_base,
3030
next_value_reporting_level: value_units_in_first_bucket as f64,
3131
current_step_highest_value_reporting_level: value_units_in_first_bucket -
3232
1,
@@ -60,4 +60,8 @@ impl<'a, T: 'a + Counter> PickyIterator<T> for Iter<'a, T> {
6060
self.hist.lowest_equivalent(self.next_value_reporting_level as u64) <
6161
self.hist.value_for(next_index)
6262
}
63+
64+
fn quantile_iterated_to(&self) -> Option<f64> {
65+
None
66+
}
6367
}

src/iterators/mod.rs

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,11 @@ pub trait PickyIterator<T: Counter> {
2323
fn pick(&mut self, index: usize, total_count_to_index: u64) -> bool;
2424
/// should we keep iterating even though all future indices are zeros?
2525
fn more(&mut self, index: usize) -> bool;
26+
27+
/// Supply the quantile iterated to in the last `pick()`, if available. If `None` is returned,
28+
/// the quantile of the current value will be used instead. Probably only useful for the
29+
/// quantile iterator.
30+
fn quantile_iterated_to(&self) -> Option<f64>;
2631
}
2732

2833
/// `HistogramIterator` provides a base iterator for a `Histogram`.
@@ -53,43 +58,49 @@ pub struct HistogramIterator<'a, T: 'a + Counter, P: PickyIterator<T>> {
5358
pub struct IterationValue<T: Counter> {
5459
value: u64,
5560
quantile: f64,
61+
quantile_iterated_to: f64,
5662
count_at_value: T,
5763
count_since_last_iteration: u64
5864
}
5965

6066
impl<T: Counter> IterationValue<T> {
6167
/// Create a new IterationValue.
62-
pub fn new(value: u64, quantile: f64, count_at_value: T, count_since_last_iteration: u64)
63-
-> IterationValue<T> {
68+
pub fn new(value: u64, quantile: f64, quantile_iterated_to: f64, count_at_value: T,
69+
count_since_last_iteration: u64) -> IterationValue<T> {
6470
IterationValue {
6571
value,
6672
quantile,
73+
quantile_iterated_to,
6774
count_at_value,
6875
count_since_last_iteration
6976
}
7077
}
7178

72-
/// the lowest value stored in the current histogram bin
79+
/// The value iterated to; `HistogramIterator` sets this to the highest equivalent value of the current histogram bin
7380
pub fn value(&self) -> u64 {
7481
self.value
7582
}
7683

77-
/// percent of recorded values that are equivalent to or below `value`.
84+
/// Percent of recorded values that are equivalent to or below `value`.
7885
/// This is simply the quantile multiplied by 100.0, so if you care about maintaining the best
7986
/// floating-point precision, use `quantile()` instead.
8087
pub fn percentile(&self) -> f64 {
8188
self.quantile * 100.0
8289
}
8390

84-
/// quantile of recorded values that are equivalent to or below `value`
91+
/// Quantile of recorded values that are equivalent to or below `value`
8592
pub fn quantile(&self) -> f64 { self.quantile }
8693

87-
/// recorded count for values equivalent to `value`
94+
/// Quantile iterated to, which in the case of quantile iteration may be different from
95+
/// `quantile` because slightly different quantiles can still map to the same bucket.
96+
pub fn quantile_iterated_to(&self) -> f64 { self.quantile_iterated_to }
97+
98+
/// Recorded count for values equivalent to `value`
8899
pub fn count_at_value(&self) -> T {
89100
self.count_at_value
90101
}
91102

92-
/// number of values traversed since the last iteration step
103+
/// Number of values traversed since the last iteration step
93104
pub fn count_since_last_iteration(&self) -> u64 {
94105
self.count_since_last_iteration
95106
}
@@ -109,9 +120,11 @@ impl<'a, T: Counter, P: PickyIterator<T>> HistogramIterator<'a, T, P> {
109120
}
110121

111122
fn current(&self) -> IterationValue<T> {
123+
let quantile = self.total_count_to_index as f64 / self.hist.count() as f64;
112124
IterationValue {
113125
value: self.hist.highest_equivalent(self.hist.value_for(self.current_index)),
114-
quantile: self.total_count_to_index as f64 / self.hist.count() as f64,
126+
quantile,
127+
quantile_iterated_to: self.picker.quantile_iterated_to().unwrap_or(quantile),
115128
count_at_value: self.hist.count_at_index(self.current_index)
116129
.expect("current index cannot exceed counts length"),
117130
count_since_last_iteration: self.total_count_to_index - self.prev_total_count
@@ -142,7 +155,9 @@ impl<'a, T: 'a, P> Iterator for HistogramIterator<'a, T, P>
142155
return None;
143156
}
144157

145-
// have we yielded all non-zeros in the histogram?
158+
// TODO should check if we've reached max, not count, to avoid early termination
159+
// on histograms with very large counts whose total would exceed u64::max_value()
160+
// Have we yielded all non-zeros in the histogram?
146161
let total = self.hist.count();
147162
if self.prev_total_count == total {
148163
// is the picker done?
@@ -163,7 +178,7 @@ impl<'a, T: 'a, P> Iterator for HistogramIterator<'a, T, P>
163178
// if we've seen all counts, no other counts should be non-zero
164179
if self.total_count_to_index == total {
165180
// TODO this can fail when total count overflows
166-
assert!(count == T::zero());
181+
assert_eq!(count, T::zero());
167182
}
168183

169184
// TODO overflow
@@ -182,6 +197,7 @@ impl<'a, T: 'a, P> Iterator for HistogramIterator<'a, T, P>
182197
// exposed to the same value again after yielding. not sure why this is the
183198
// behavior we want, but it's what the original Java implementation dictates.
184199

200+
// TODO count starting at 0 each time we emit a value to be less prone to overflow
185201
self.prev_total_count = self.total_count_to_index;
186202
return Some(val);
187203
}

0 commit comments

Comments
 (0)