#!/usr/bin/env Rscript
library(ggplot2)
library(plyr)

# get __dirname and load ./_cli.R
args = commandArgs(trailingOnly = FALSE)
dirname = dirname(sub("--file=", "", args[grep("--file", args)]))
source(paste0(dirname, '/_cli.R'), chdir = TRUE)
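
# _cli.R (not shown here) is assumed to define `args.options`, a named list
# of parsed command-line flags: args.options$plot holds the value passed to
# --plot, or TRUE when the flag is given without a value.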

if (!is.null(args.options$help) ||
    (!is.null(args.options$plot) && args.options$plot == TRUE)) {
  stop("usage: cat file.csv | Rscript bar.R
  --help show this message
  --plot filename save plot to filename")
}

# plot.filename is NULL when --plot was not given; in that case no plot
# is saved.
plot.filename = args.options$plot
| 18 | + |
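# The CSV on stdin is expected (judging from colClasses below and the
# columns referenced later) to provide `binary`, `filename` and
# `configuration` as character columns plus two numeric columns, of which
# only `rate` is used by this script.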
dat = read.csv(
  file('stdin'),
  colClasses = c('character', 'character', 'character', 'numeric', 'numeric')
)

dat$nameTwoLines = paste0(dat$filename, '\n', dat$configuration)
dat$name = paste0(dat$filename, ' ', dat$configuration)

# Create a bar plot of the mean rate for each benchmark
if (!is.null(plot.filename)) {
  p = ggplot(data = dat, aes(x = nameTwoLines, y = rate, fill = binary))
  p = p + geom_bar(stat = "summary", fun = "mean", position = position_dodge())
  p = p + ylab("rate of operations (higher is better)")
  p = p + xlab("benchmark")
  p = p + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
  ggsave(plot.filename, p)
}
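
# Note: ggsave() infers the image format from the extension of
# plot.filename (e.g. ".png" or ".svg") and, with no width/height given
# and no open graphics device, falls back to its default 7 x 7 inch size.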

# Compute the standard error of the difference in means, as used in
# Welch's t-test: sqrt(var(old)/n.old + var(new)/n.new).
welch.sd = function (old.rate, new.rate) {
  old.se.squared = var(old.rate) / length(old.rate)
  new.se.squared = var(new.rate) / length(new.rate)
  return(sqrt(old.se.squared + new.se.squared))
}
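
# Sanity check (assuming R >= 3.6, where the htest object exposes $stderr):
# for two samples x and y, welch.sd(x, y) should equal t.test(x, y)$stderr,
# since t.test() computes the same quantity internally.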

# Calculate the confidence interval of the improvement. The interval is
# divided by old.mu, not new.mu, because the improvement itself is
# expressed relative to old.mu.
confidence.interval = function (shared.se, old.mu, w, risk) {
  interval = qt(1 - (risk / 2), w$parameter) * shared.se
  return(sprintf("±%.2f%%", (interval / old.mu) * 100))
}
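
# The half-width above is the usual t-based margin of error: the
# (1 - risk/2) quantile of the t distribution with the Welch degrees of
# freedom (w$parameter), times the standard error. At risk = 0.05 and
# large df, qt(0.975, df) approaches 1.96, so the interval is roughly
# ±1.96 * shared.se, reported as a percentage of old.mu.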

# Calculate the statistics table, one row per benchmark name.
statistics = ddply(dat, "name", function(subdat) {
  old.rate = subset(subdat, binary == "old")$rate
  new.rate = subset(subdat, binary == "new")$rate

  # Calculate the improvement of the "new" binary relative to the "old" one
  old.mu = mean(old.rate)
  new.mu = mean(new.rate)
  improvement = sprintf("%.2f %%", ((new.mu - old.mu) / old.mu * 100))

  # Default row, used when there is not enough data for a t-test.
  r = list(
    confidence = "NA",
    improvement = improvement,
    "accuracy (*)" = "NA",
    "(**)" = "NA",
    "(***)" = "NA"
  )

  # Only compute the p-value when both binaries have at least two
  # observations; the sample variance is undefined for a single one.
  if (length(old.rate) > 1 && length(new.rate) > 1) {
    # Perform a Welch two-sample t-test (t.test() defaults to
    # var.equal = FALSE) to see if there actually is a difference in
    # performance.
    w = t.test(rate ~ binary, data = subdat)
    shared.se = welch.sd(old.rate, new.rate)

    # Add user-friendly stars to the table. At least one star is required
    # before one can claim an improvement.
    confidence = ''
    if (w$p.value < 0.001) {
      confidence = '***'
    } else if (w$p.value < 0.01) {
      confidence = '**'
    } else if (w$p.value < 0.05) {
      confidence = '*'
    }
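
    # These thresholds match R's conventional significance codes (as
    # printed by summary.lm, for example): '*' p < 0.05, '**' p < 0.01,
    # '***' p < 0.001.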

    r = list(
      confidence = confidence,
      improvement = improvement,
      "accuracy (*)" = confidence.interval(shared.se, old.mu, w, 0.05),
      "(**)" = confidence.interval(shared.se, old.mu, w, 0.01),
      "(***)" = confidence.interval(shared.se, old.mu, w, 0.001)
    )
  }

  return(data.frame(r, check.names = FALSE))
})

# Set the benchmark names as the row.names to left-align them in the print.
row.names(statistics) = statistics$name
statistics$name = NULL

options(width = 200)
print(statistics)
cat("\n")
cat(sprintf(
"Be aware that when doing many comparisons the risk of a false-positive
result increases. In this case there are %d comparisons, so you can
expect the following number of false-positive results:
  %.2f false positives, when considering a 5%% risk acceptance (*, **, ***),
  %.2f false positives, when considering a 1%% risk acceptance (**, ***),
  %.2f false positives, when considering a 0.1%% risk acceptance (***)
",
nrow(statistics),
nrow(statistics) * 0.05,
nrow(statistics) * 0.01,
nrow(statistics) * 0.001))
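
# The expected counts above are simply nrow(statistics) * alpha for each
# significance level alpha: the expected number of type I errors if none
# of the benchmarks actually changed.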