
Commit b25b6fb

benchmark: add bar.R
1 parent e2e3f5c commit b25b6fb


2 files changed: +123 -1 lines changed


benchmark/bar.R

Lines changed: 120 additions & 0 deletions
@@ -0,0 +1,120 @@
#!/usr/bin/env Rscript
library(ggplot2);
library(plyr);

# get __dirname and load ./_cli.R
args = commandArgs(trailingOnly = F);
dirname = dirname(sub("--file=", "", args[grep("--file", args)]));
source(paste0(dirname, '/_cli.R'), chdir=T);

if (!is.null(args.options$help) ||
    (!is.null(args.options$plot) && args.options$plot == TRUE)) {
  stop("usage: cat file.csv | Rscript bar.R
  --help show this message
  --plot filename save plot to filename");
}

plot.filename = args.options$plot;

dat = read.csv(
  file('stdin'),
  colClasses=c('character', 'character', 'character', 'numeric', 'numeric')
);
dat = data.frame(dat);

dat$nameTwoLines = paste0(dat$filename, '\n', dat$configuration);
dat$name = paste0(dat$filename, ' ', dat$configuration);

# Create a bar chart when a plot filename was given
if (!is.null(plot.filename)) {
  p = ggplot(data=dat, aes(x=nameTwoLines, y=rate, fill=binary));
  p = p + geom_bar(stat="summary", position=position_dodge());
  p = p + ylab("rate of operations (higher is better)");
  p = p + xlab("benchmark");
  p = p + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5));
  ggsave(plot.filename, p);
}

# Computes the shared standard error, as used in Welch's t-test.
welch.sd = function (old.rate, new.rate) {
  old.se.squared = var(old.rate) / length(old.rate)
  new.se.squared = var(new.rate) / length(new.rate)
  return(sqrt(old.se.squared + new.se.squared))
}

# Calculate the improvement confidence interval. The improvement is calculated
# by dividing by old.mu and not new.mu, because old.mu is what the mean
# improvement is calculated relative to.
confidence.interval = function (shared.se, old.mu, w, risk) {
  interval = qt(1 - (risk / 2), w$parameter) * shared.se;
  return(sprintf("±%.2f%%", (interval / old.mu) * 100))
}

# Calculate the statistics table.
statistics = ddply(dat, "name", function(subdat) {
  old.rate = subset(subdat, binary == "old")$rate;
  new.rate = subset(subdat, binary == "new")$rate;

  # Calculate improvement for the "new" binary compared with the "old" binary
  old.mu = mean(old.rate);
  new.mu = mean(new.rate);
  improvement = sprintf("%.2f %%", ((new.mu - old.mu) / old.mu * 100));

  r = list(
    confidence = "NA",
    improvement = improvement,
    "accuracy (*)" = "NA",
    "(**)" = "NA",
    "(***)" = "NA"
  );

  # Check if there is enough data to calculate the p-value.
  if (length(old.rate) > 1 && length(new.rate) > 1) {
    # Perform a statistical test to see if there actually is a difference in
    # performance.
    w = t.test(rate ~ binary, data=subdat);
    shared.se = welch.sd(old.rate, new.rate)

    # Add user-friendly stars to the table. There should be at least one star
    # before you can say that there is an improvement.
    confidence = '';
    if (w$p.value < 0.001) {
      confidence = '***';
    } else if (w$p.value < 0.01) {
      confidence = '**';
    } else if (w$p.value < 0.05) {
      confidence = '*';
    }

    r = list(
      confidence = confidence,
      improvement = improvement,
      "accuracy (*)" = confidence.interval(shared.se, old.mu, w, 0.05),
      "(**)" = confidence.interval(shared.se, old.mu, w, 0.01),
      "(***)" = confidence.interval(shared.se, old.mu, w, 0.001)
    );
  }

  return(data.frame(r, check.names=FALSE));
});

# Set the benchmark names as the row.names to left align them in the print.
row.names(statistics) = statistics$name;
statistics$name = NULL;

options(width = 200);
print(statistics);
cat("\n")
cat(sprintf(
"Be aware that when doing many comparisons the risk of a false-positive
result increases. In this case, there are %d comparisons, you can thus
expect the following amount of false-positive results:
  %.2f false positives, when considering a 5%% risk acceptance (*, **, ***),
  %.2f false positives, when considering a 1%% risk acceptance (**, ***),
  %.2f false positives, when considering a 0.1%% risk acceptance (***)
",
nrow(statistics),
nrow(statistics) * 0.05,
nrow(statistics) * 0.01,
nrow(statistics) * 0.001))
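
The script is meant to be fed a benchmark results CSV on stdin; the statistics table is always printed, and the bar chart is only written when --plot is passed. A minimal invocation, following the usage text in the script (the CSV name is the one used in the doc change below, the PNG name is only an example):

$ cat compare-pr-5134.csv | Rscript benchmark/bar.R
$ cat compare-pr-5134.csv | Rscript benchmark/bar.R --plot bar-chart.png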

doc/contributing/writing-and-running-benchmarks.md

Lines changed: 3 additions & 1 deletion
@@ -293,7 +293,9 @@ module, you can use the `--filter` option:_
 ```
 
 For analyzing the benchmark results, use [node-benchmark-compare][] or the R
-script `benchmark/compare.R`.
+scripts:
+* `benchmark/compare.R`
+* `benchmark/bar.R`
 
 ```console
 $ node-benchmark-compare compare-pr-5134.csv # or cat compare-pr-5134.csv | Rscript benchmark/compare.R
