benchmark/compare.R - external/github.com/v8/node - Git at Google

 #!/usr/bin/env Rscript
 library(ggplot2);
 library(plyr);

 # Locate the directory that contains this script (the equivalent of Node's
 # __dirname) and load ./_cli.R from it. Rscript exposes the script path as a
 # "--file=<path>" argument; the pattern is anchored and only the first match
 # is used so that unrelated arguments which merely contain "--file" cannot
 # turn the path into a vector.
 # NOTE(review): args.options, read further down, is not defined in this
 # file - presumably _cli.R creates it; confirm.
 args = commandArgs(trailingOnly = FALSE);
 dirname = dirname(sub("^--file=", "", args[grep("^--file=", args)][1]));
 source(paste0(dirname, '/_cli.R'), chdir = TRUE);

 # Abort with the usage text when --help is present, or when the plot option
 # equals TRUE (presumably meaning --plot was passed without a filename).
 # The plot option is only inspected when --help is absent.
 show.usage = !is.null(args.options$help);
 if (!show.usage) {
   show.usage = !is.null(args.options$plot) && args.options$plot == TRUE;
 }
 if (show.usage) {
   stop("usage: cat file.csv | Rscript compare.R
   --help           show this message
   --plot filename  save plot to filename");
 }

 plot.filename = args.options$plot;

 # Read the benchmark results as CSV from stdin: three character columns
 # followed by two numeric columns.
 column.classes = c(rep('character', 3), rep('numeric', 2));
 dat = data.frame(read.csv(file('stdin'), colClasses = column.classes));

 # Two labels per row: one with a line break between filename and
 # configuration (used on the plot axis), one plain concatenation (used as
 # the grouping key for the statistics table).
 dat$nameTwoLines = paste(dat$filename, dat$configuration, sep = '\n');
 dat$name = paste0(dat$filename, dat$configuration);

 # When a plot filename was supplied, draw one box per benchmark and binary
 # and save the figure to that file.
 if (!is.null(plot.filename)) {
   rotated.labels = element_text(angle = 90, hjust = 1, vjust = 0.5);
   p = ggplot(data = dat) +
     geom_boxplot(aes(x = nameTwoLines, y = rate, fill = binary)) +
     ylab("rate of operations (higher is better)") +
     xlab("benchmark") +
     theme(axis.text.x = rotated.labels);
   ggsave(plot.filename, p);
 }

 # Standard error of the difference between the two sample means, as used in
 # the Welch t-test: sqrt(var(old) / n_old + var(new) / n_new).
 welch.sd = function (old.rate, new.rate) {
   squared.se = function (rate) var(rate) / length(rate);
   sqrt(squared.se(old.rate) + squared.se(new.rate))
 }

 # Format the half-width of the two-sided confidence interval of the
 # improvement for the given risk level. The width is expressed as a
 # percentage of old.mu (not new.mu), because the mean improvement is itself
 # reported relative to old.mu. The degrees of freedom come from w$parameter.
 confidence.interval = function (shared.se, old.mu, w, risk) {
   t.quantile = qt(1 - (risk / 2), w$parameter);
   relative.width = (t.quantile * shared.se / old.mu) * 100;
   sprintf("±%.2f%%", relative.width)
 }

 # Build the results table: one row per benchmark name holding the relative
 # improvement of the "new" binary over the "old" one, significance stars and
 # the confidence interval widths at the 5%, 1% and 0.1% risk levels.
 statistics = ddply(dat, "name", function(subdat) {
   old.rate = subdat$rate[which(subdat$binary == "old")];
   new.rate = subdat$rate[which(subdat$binary == "new")];

   # Relative change of the "new" mean with respect to the "old" mean
   old.mu = mean(old.rate);
   new.mu = mean(new.rate);
   improvement = sprintf("%.2f %%", ((new.mu - old.mu) / old.mu * 100));

   # Placeholder row; it is kept as-is when there is too little data to test
   r = list(
     confidence = "NA",
     improvement = improvement,
     "accuracy (*)" = "NA",
     "(**)" = "NA",
     "(***)" = "NA"
   );

   # Both binaries need more than one sample before a p-value is calculated
   if (length(old.rate) > 1 && length(new.rate) > 1) {
     # Run a t-test to see if there actually is a difference in performance
     w = t.test(rate ~ binary, data=subdat);
     shared.se = welch.sd(old.rate, new.rate);

     # User friendly stars: none for p >= 0.05, then one more star for each
     # stricter threshold that is met. There should be at least one star
     # before you can say that there is an improvement.
     if (w$p.value >= 0.05) {
       stars = '';
     } else if (w$p.value >= 0.01) {
       stars = '*';
     } else if (w$p.value >= 0.001) {
       stars = '**';
     } else {
       stars = '***';
     }

     r$confidence = stars;
     r[["accuracy (*)"]] = confidence.interval(shared.se, old.mu, w, 0.05);
     r[["(**)"]] = confidence.interval(shared.se, old.mu, w, 0.01);
     r[["(***)"]] = confidence.interval(shared.se, old.mu, w, 0.001);
   }

   return(data.frame(r, check.names=FALSE));
 });


 # Move the benchmark names into the row names (dropping the column) so that
 # they are left aligned when the table is printed.
 rownames(statistics) = statistics$name;
 statistics[["name"]] = NULL;

 options(width = 200);
 print(statistics);

 # Expected number of false positives at each risk level, given how many
 # comparisons were made.
 comparisons = nrow(statistics);
 expected = comparisons * c(0.05, 0.01, 0.001);
 false.positive.note = "Be aware that when doing many comparisons the risk of a false-positive
 result increases. In this case there are %d comparisons, you can thus
 expect the following amount of false-positive results:
   %.2f false positives, when considering a   5%% risk acceptance (*, **, ***),
   %.2f false positives, when considering a   1%% risk acceptance (**, ***),
   %.2f false positives, when considering a 0.1%% risk acceptance (***)
 ";
 cat("\n")
 cat(sprintf(false.positive.note,
   comparisons, expected[1], expected[2], expected[3]))
	#!/usr/bin/env Rscript
	library(ggplot2);
	library(plyr);

	# Locate the directory that contains this script (the equivalent of Node's
	# __dirname) and load ./_cli.R from it. Rscript exposes the script path as a
	# "--file=<path>" argument; the pattern is anchored and only the first match
	# is used so that unrelated arguments which merely contain "--file" cannot
	# turn the path into a vector.
	# NOTE(review): args.options, read further down, is not defined in this
	# file - presumably _cli.R creates it; confirm.
	args = commandArgs(trailingOnly = FALSE);
	dirname = dirname(sub("^--file=", "", args[grep("^--file=", args)][1]));
	source(paste0(dirname, '/_cli.R'), chdir = TRUE);

	# Abort with the usage text when --help is present, or when the plot option
	# equals TRUE (presumably meaning --plot was passed without a filename).
	# The operators and the pipe in the message are written unescaped: "\|" is
	# not a valid escape sequence in an R string and "\|\|" is not an operator.
	if (!is.null(args.options$help) ||
	    (!is.null(args.options$plot) && args.options$plot == TRUE)) {
	  stop("usage: cat file.csv | Rscript compare.R
	--help show this message
	--plot filename save plot to filename");
	}

	plot.filename = args.options$plot;

	# Read the benchmark results as CSV from stdin: three character columns
	# followed by two numeric columns.
	column.classes = c(rep('character', 3), rep('numeric', 2));
	dat = data.frame(read.csv(file('stdin'), colClasses = column.classes));

	# Two labels per row: one with a line break between filename and
	# configuration (used on the plot axis), one plain concatenation (used as
	# the grouping key for the statistics table).
	dat$nameTwoLines = paste(dat$filename, dat$configuration, sep = '\n');
	dat$name = paste0(dat$filename, dat$configuration);

	# When a plot filename was supplied, draw one box per benchmark and binary
	# and save the figure to that file.
	if (!is.null(plot.filename)) {
	  rotated.labels = element_text(angle = 90, hjust = 1, vjust = 0.5);
	  p = ggplot(data = dat) +
	    geom_boxplot(aes(x = nameTwoLines, y = rate, fill = binary)) +
	    ylab("rate of operations (higher is better)") +
	    xlab("benchmark") +
	    theme(axis.text.x = rotated.labels);
	  ggsave(plot.filename, p);
	}

	# Standard error of the difference between the two sample means, as used in
	# the Welch t-test: sqrt(var(old) / n_old + var(new) / n_new).
	welch.sd = function (old.rate, new.rate) {
	  squared.se = function (rate) var(rate) / length(rate);
	  sqrt(squared.se(old.rate) + squared.se(new.rate))
	}

	# Format the half-width of the two-sided confidence interval of the
	# improvement for the given risk level. The width is expressed as a
	# percentage of old.mu (not new.mu), because the mean improvement is itself
	# reported relative to old.mu. The degrees of freedom come from w$parameter.
	confidence.interval = function (shared.se, old.mu, w, risk) {
	  t.quantile = qt(1 - (risk / 2), w$parameter);
	  relative.width = (t.quantile * shared.se / old.mu) * 100;
	  sprintf("±%.2f%%", relative.width)
	}

	# Build the results table: one row per benchmark name holding the relative
	# improvement of the "new" binary over the "old" one, significance stars and
	# the confidence interval widths at the 5%, 1% and 0.1% risk levels.
	statistics = ddply(dat, "name", function(subdat) {
	  old.rate = subdat$rate[which(subdat$binary == "old")];
	  new.rate = subdat$rate[which(subdat$binary == "new")];

	  # Relative change of the "new" mean with respect to the "old" mean
	  old.mu = mean(old.rate);
	  new.mu = mean(new.rate);
	  improvement = sprintf("%.2f %%", ((new.mu - old.mu) / old.mu * 100));

	  # Placeholder row; it is kept as-is when there is too little data to test
	  r = list(
	    confidence = "NA",
	    improvement = improvement,
	    "accuracy (*)" = "NA",
	    "(**)" = "NA",
	    "(***)" = "NA"
	  );

	  # Both binaries need more than one sample before a p-value is calculated
	  if (length(old.rate) > 1 && length(new.rate) > 1) {
	    # Run a t-test to see if there actually is a difference in performance
	    w = t.test(rate ~ binary, data=subdat);
	    shared.se = welch.sd(old.rate, new.rate);

	    # User friendly stars: none for p >= 0.05, then one more star for each
	    # stricter threshold that is met. There should be at least one star
	    # before you can say that there is an improvement.
	    if (w$p.value >= 0.05) {
	      stars = '';
	    } else if (w$p.value >= 0.01) {
	      stars = '*';
	    } else if (w$p.value >= 0.001) {
	      stars = '**';
	    } else {
	      stars = '***';
	    }

	    r$confidence = stars;
	    r[["accuracy (*)"]] = confidence.interval(shared.se, old.mu, w, 0.05);
	    r[["(**)"]] = confidence.interval(shared.se, old.mu, w, 0.01);
	    r[["(***)"]] = confidence.interval(shared.se, old.mu, w, 0.001);
	  }

	  return(data.frame(r, check.names=FALSE));
	});


	# Set the benchmark names as the row.names to left align them in the print
	row.names(statistics) = statistics$name;
	statistics$name = NULL;

	options(width = 200);
	print(statistics);
	cat("\n")
	# Print the expected number of false positives for each risk level. The
	# star legends mirror the thresholds used for the confidence column
	# (* for p < 0.05, ** for p < 0.01, *** for p < 0.001): the 5% level covers
	# all three markers, the 1% level covers ** and ***, the 0.1% level only ***.
	cat(sprintf(
	"Be aware that when doing many comparisons the risk of a false-positive
	result increases. In this case there are %d comparisons, you can thus
	expect the following amount of false-positive results:
	%.2f false positives, when considering a 5%% risk acceptance (*, **, ***),
	%.2f false positives, when considering a 1%% risk acceptance (**, ***),
	%.2f false positives, when considering a 0.1%% risk acceptance (***)
	",
	nrow(statistics),
	nrow(statistics) * 0.05,
	nrow(statistics) * 0.01,
	nrow(statistics) * 0.001))