I’ve updated the qqmultinorm.R script a bit so that it’s now capable of picking a more optimal plot size (less unused space).
The idea refers to deciding the sides (dimensions) of a rectangle if the areal (number of plots) is known i.e., optimising the dimensions of a rectangle given the areal. We want the perimeter as small as possible (for better viewing), preferably wider than longer if square isn’t possible. A given number n is factorised in to numbers within a given error. For example if the allowed error is 2, then 7 gets factorised in (1,7), (2, 4), (4, 2), and (7, 1) (here redundant factorisation is included). Although 2*4 = 8, but abs(7-8) = 1 <= 2, so it's acceptable. Actually the default error is 10% of the input areal.
The way the proper factorisations is chosen, is by ordering the pairs by ascending the difference of the components i.e, how much the width and height differs. And then ordered ascending by height so that the plot gets wide screen-like instead of poster-like.
The new script is a follows (only find.optimal.mfrow.size(...) and a bit logic in qqmultinorm(...) added):
# File name: qqmultinorm.R
# Version: 1.1
# Last updated: 2009-08-28
#
# This R-code is made by:
# Mikkel Meyer Andersen, Denmark
# mikl [funny-a] math [.] aau [.] dk or
# mikl [funny-a] mikl [.] dk
#
# Licence: GPLv2
#
# Feel free to use it, but if you do I'll like to hear about it (just for fun).
# If you make corrections, please submit them back so others can enjoy them as well.
qqchisq <- function(y, main, df=2, continuity.correction = 0.5)
{
n <- length(y)
y <- sort(y)
c <- numeric(n)
for (i in 1:n)
c[i] <- qchisq((i-continuity.correction)/n, df=df)
plot(c, y, xlab="Theoretical Quantiles", ylab="Sample Quantiles", main=main)
lines(c(c[1], c[n]), c(c[1], c[n]), type="l")
}
dec2bin <- function(x)
{
if (!is.vector(x) || length(x) != 1 || x < 0)
stop("x must be a non-negative integer")
N <- length(x)
ndigits <- floor(log2(x)) + 1
bin <- numeric(ndigits)
for (i in (ndigits-1):0)
{
tmp <- 2^i
if (x %/% tmp >= 1)
{
bin[i+1] <- 1
x <- x - tmp
}
}
return(rev(bin))
}
# Returns the power set without the empty set
power.set <- function(v)
{
n <- length(v)
N <- 2^n - 1
ps <- vector("list", N)
for (i in 1:(N-1))
{
Nbin <- dec2bin(i)
Nbin <- c(numeric(n-length(Nbin)), Nbin)
Nbin <- rev(Nbin)
ps[[i]] <- v[which(Nbin == 1)]
}
ps[[N]] <- v
return(ps)
}
# Input: n
# - here the number of plots
# Returns c(h, w)
# - the optimal choice of rows and cols to use in in mfrow
find.optimal.mfrow.size <- function(n)
{
w0 <- ceiling(sqrt(n))
# The area can at max contain of 10% unused space
max.error <- round(n*0.1)
n.minus.error <- n - max.error
n.plus.error <- n + max.error
# If n is a square, fine!
if (w0^2 == n)
return(c(w0, w0))
# Col 1 and 2: w and h
# Col 3: The difference between w and h: this should be as small as possible
candidates <- matrix(ncol=3)
for (w in w0:1)
{
h <- ceiling(n / w)
n0 <- w*h
# Because of ceiling we know that n0 >= n
if (n0 <= n.plus.error)
candidates <- rbind(candidates, c(h, w, abs(w-h)))
}
# First row is NA
candidates <- candidates[-1,]
# Uups, something went wront - well, don't panic
if (nrow(candidates) == 0)
return(c(w0, w0))
# First order by abs(w-h) and then by h to get a widescreen-look
# instead of a poster-look
candidates <- candidates[order(candidates[,3], candidates[,1]), ]
return(candidates[1, c(1,2)])
}
# dataset: variables in columns and observations as rows
# subset.min.size, subset.max.size: inclusive limits
# filename: if specified, the plot are saved as a png file with this filename
qqmultinorm <- function(dataset, subset.min.size = 1, subset.max.size = 4, filename = NULL, use.optimale.size = F)
{
p <- ncol(dataset)
n <- nrow(dataset)
if (subset.min.size < 1) stop("subset.min.size < 1")
if (subset.min.size > p) stop("subset.min.size > p")
if (subset.max.size < 1) stop("subset.max.size < 1")
if (subset.max.size > p) stop("subset.max.size > p")
if (subset.min.size > subset.max.size) stop("subset.min.size > subset.max.size")
if (is.null(colnames(dataset)))
colnames(dataset) <- 1:p
# We have p variables. If all is to be checked against each other,
# then we have a power-set with 2^p subsets (including the empty set)
# Here we get subset containing indexes of the variables to include
# Note that power.set doesn't include the empty set.
subsets <- power.set(1:p)
subsets.len <- length(subsets)
# To get the plots with the fewest variables first, we do a litte trick:
# While we find out which plots to include, we build a list
# where each element of a list is the index of the subset,
# and the index of the element is the size of the subset.
# (The +1 is because the limits are includesive!)
s.included <- vector("list", subset.max.size - subset.min.size + 1)
plots <- 0
for (i in 1:subsets.len)
{
s <- subsets[[i]]
s.len <- length(s)
if (s.len >= subset.min.size && s.len <= subset.max.size)
{
plots <- plots + 1
s.included[[s.len - subset.min.size + 1]] <- c(s.included[[s.len - subset.min.size + 1]], i)
}
}
# Now it's possible to build the subset index vector;
# we disregard the size of each subset; no more need to know it.
s.indexes <- c()
for (s in s.included)
{
s.indexes <- c(s.indexes, s)
}
# We want the best view
plot.per.row <- ceiling(plots^(1/2))
plot.per.column <- ceiling(plots^(1/2))
if (use.optimale.size)
{
mfrow.parameters <- find.optimal.mfrow.size(plots)
plot.per.row <- mfrow.parameters[1]
plot.per.column <- mfrow.parameters[2]
}
plot.width <- 300
plot.height <- 200
if (!is.null(filename))
png(file=paste(filename, ".png", sep=""), bg="white", width = plot.per.row * plot.width, height = plot.per.column * plot.height)
par(mfrow = c(plot.per.row, plot.per.column))
# There's no need to calculate a whole lot several times:
ybar <- as.vector(colMeans(dataset))
S <- as.matrix(var(dataset))
current <- 1
for (i in s.indexes)
{
s <- subsets[[i]]
s.len <- length(s)
if (s.len > subset.max.size)
next
cat("Processing subset no.", current, "out of", plots, "\n")
# Container for our values
squared.dist <- numeric(n)
# qr.solve(A) = A^(-1)
Sinv <- qr.solve(S[s,s])
# Then calculate the squared distance for each datapoint
for (i in 1:n)
{
c <- dataset[i,s] - ybar[s]
squared.dist[i] <- t(c) %*% Sinv %*% c
}
# Finding the order statistic
squared.dist <- sort(squared.dist)
qqchisq(squared.dist, paste(colnames(dataset)[s], collapse=", "), s.len)
current <- current + 1
}
if (!is.null(filename))
dev.off()
}
# Example:
A <- matrix(rnorm(2000, mean=3, sd=2), ncol=8)
qqmultinorm(A, 2, 2, "multinorm-optimal.size", use.optimale.size = T)
qqmultinorm(A, 2, 2, "multinorm", use.optimale.size = F)