| #!/usr/bin/env Rscript |
| library(ggplot2); |
| library(plyr); |
| |
| # get __dirname and load ./_cli.R |
| args = commandArgs(trailingOnly = F); |
| dirname = dirname(sub("--file=", "", args[grep("--file", args)])); |
| source(paste0(dirname, '/_cli.R'), chdir=T); |
| |
| if (!is.null(args.options$help) || |
| (!is.null(args.options$plot) && args.options$plot == TRUE)) { |
| stop("usage: cat file.csv | Rscript compare.R |
| --help show this message |
| --plot filename save plot to filename"); |
| } |
| |
| plot.filename = args.options$plot; |
| |
| dat = read.csv( |
| file('stdin'), |
| colClasses=c('character', 'character', 'character', 'numeric', 'numeric') |
| ); |
| dat = data.frame(dat); |
| |
| dat$nameTwoLines = paste0(dat$filename, '\n', dat$configuration); |
| dat$name = paste0(dat$filename, dat$configuration); |
| |
| # Create a box plot |
| if (!is.null(plot.filename)) { |
| p = ggplot(data=dat); |
| p = p + geom_boxplot(aes(x=nameTwoLines, y=rate, fill=binary)); |
| p = p + ylab("rate of operations (higher is better)"); |
| p = p + xlab("benchmark"); |
| p = p + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)); |
| ggsave(plot.filename, p); |
| } |
| |
| # computes the shared standard error, as used in the welch t-test |
| welch.sd = function (old.rate, new.rate) { |
| old.se.squared = var(old.rate) / length(old.rate) |
| new.se.squared = var(new.rate) / length(new.rate) |
| return(sqrt(old.se.squared + new.se.squared)) |
| } |
| |
| # calculate the improvement confidence interval. The improvement is calculated |
| # by dividing by old.mu and not new.mu, because old.mu is what the mean |
| # improvement is calculated relative to. |
| confidence.interval = function (shared.se, old.mu, w, risk) { |
| interval = qt(1 - (risk / 2), w$parameter) * shared.se; |
| return(sprintf("±%.2f%%", (interval / old.mu) * 100)) |
| } |
| |
| # Print a table with results |
| statistics = ddply(dat, "name", function(subdat) { |
| old.rate = subset(subdat, binary == "old")$rate; |
| new.rate = subset(subdat, binary == "new")$rate; |
| |
| # Calculate improvement for the "new" binary compared with the "old" binary |
| old.mu = mean(old.rate); |
| new.mu = mean(new.rate); |
| improvement = sprintf("%.2f %%", ((new.mu - old.mu) / old.mu * 100)); |
| |
| r = list( |
| confidence = "NA", |
| improvement = improvement, |
| "accuracy (*)" = "NA", |
| "(**)" = "NA", |
| "(***)" = "NA" |
| ); |
| |
| # Check if there is enough data to calculate the calculate the p-value |
| if (length(old.rate) > 1 && length(new.rate) > 1) { |
| # Perform a statistics test to see of there actually is a difference in |
| # performance. |
| w = t.test(rate ~ binary, data=subdat); |
| shared.se = welch.sd(old.rate, new.rate) |
| |
| # Add user friendly stars to the table. There should be at least one star |
| # before you can say that there is an improvement. |
| confidence = ''; |
| if (w$p.value < 0.001) { |
| confidence = '***'; |
| } else if (w$p.value < 0.01) { |
| confidence = '**'; |
| } else if (w$p.value < 0.05) { |
| confidence = '*'; |
| } |
| |
| r = list( |
| confidence = confidence, |
| improvement = improvement, |
| "accuracy (*)" = confidence.interval(shared.se, old.mu, w, 0.05), |
| "(**)" = confidence.interval(shared.se, old.mu, w, 0.01), |
| "(***)" = confidence.interval(shared.se, old.mu, w, 0.001) |
| ); |
| } |
| |
| return(data.frame(r, check.names=FALSE)); |
| }); |
| |
| |
| # Set the benchmark names as the row.names to left align them in the print |
| row.names(statistics) = statistics$name; |
| statistics$name = NULL; |
| |
| options(width = 200); |
| print(statistics); |
| cat("\n") |
| cat(sprintf( |
| "Be aware that when doing many comparisons the risk of a false-positive |
| result increases. In this case there are %d comparisons, you can thus |
| expect the following amount of false-positive results: |
| %.2f false positives, when considering a 5%% risk acceptance (*, **, ***), |
| %.2f false positives, when considering a 1%% risk acceptance (**, ***), |
| %.2f false positives, when considering a 0.1%% risk acceptance (***) |
| ", |
| nrow(statistics), |
| nrow(statistics) * 0.05, |
| nrow(statistics) * 0.01, |
| nrow(statistics) * 0.001)) |