# File name: qqmultinorm.R
# Version: 1.1
# Last updated: 2009-08-28
#
# This R-code is made by:
# Mikkel Meyer Andersen, Denmark
# mikl [funny-a] math [.] aau [.] dk or
# mikl [funny-a] mikl [.] dk
#
# Licence: GPLv2
#
# Feel free to use it, but if you do I'll like to hear about it (just for fun).
# If you make corrections, please submit them back so others can enjoy them as well.
qqchisq <- function(y, main, df=2, continuity.correction = 0.5)
{
n <- length(y)
y <- sort(y)
c <- numeric(n)
for (i in 1:n)
c[i] <- qchisq((i-continuity.correction)/n, df=df)
plot(c, y, xlab="Theoretical Quantiles", ylab="Sample Quantiles", main=main)
lines(c(c[1], c[n]), c(c[1], c[n]), type="l")
}
dec2bin <- function(x)
{
if (!is.vector(x) || length(x) != 1 || x < 0)
stop("x must be a non-negative integer")
N <- length(x)
ndigits <- floor(log2(x)) + 1
bin <- numeric(ndigits)
for (i in (ndigits-1):0)
{
tmp <- 2^i
if (x %/% tmp >= 1)
{
bin[i+1] <- 1
x <- x - tmp
}
}
return(rev(bin))
}
# Returns the power set without the empty set
power.set <- function(v)
{
n <- length(v)
N <- 2^n - 1
ps <- vector("list", N)
for (i in 1:(N-1))
{
Nbin <- dec2bin(i)
Nbin <- c(numeric(n-length(Nbin)), Nbin)
Nbin <- rev(Nbin)
ps[[i]] <- v[which(Nbin == 1)]
}
ps[[N]] <- v
return(ps)
}
# Input: n
# - here the number of plots
# Returns c(h, w)
# - the optimal choice of rows and cols to use in in mfrow
find.optimal.mfrow.size <- function(n)
{
w0 <- ceiling(sqrt(n))
# The area can at max contain of 10% unused space
max.error <- round(n*0.1)
n.minus.error <- n - max.error
n.plus.error <- n + max.error
# If n is a square, fine!
if (w0^2 == n)
return(c(w0, w0))
# Col 1 and 2: w and h
# Col 3: The difference between w and h: this should be as small as possible
candidates <- matrix(ncol=3)
for (w in w0:1)
{
h <- ceiling(n / w)
n0 <- w*h
# Because of ceiling we know that n0 >= n
if (n0 <= n.plus.error)
candidates <- rbind(candidates, c(h, w, abs(w-h)))
}
# First row is NA
candidates <- candidates[-1,]
# Uups, something went wront - well, don't panic
if (nrow(candidates) == 0)
return(c(w0, w0))
# First order by abs(w-h) and then by h to get a widescreen-look
# instead of a poster-look
candidates <- candidates[order(candidates[,3], candidates[,1]), ]
return(candidates[1, c(1,2)])
}
# dataset: variables in columns and observations as rows
# subset.min.size, subset.max.size: inclusive limits
# filename: if specified, the plot are saved as a png file with this filename
qqmultinorm <- function(dataset, subset.min.size = 1, subset.max.size = 4, filename = NULL, use.optimale.size = F)
{
p <- ncol(dataset)
n <- nrow(dataset)
if (subset.min.size < 1) stop("subset.min.size < 1")
if (subset.min.size > p) stop("subset.min.size > p")
if (subset.max.size < 1) stop("subset.max.size < 1")
if (subset.max.size > p) stop("subset.max.size > p")
if (subset.min.size > subset.max.size) stop("subset.min.size > subset.max.size")
if (is.null(colnames(dataset)))
colnames(dataset) <- 1:p
# We have p variables. If all is to be checked against each other,
# then we have a power-set with 2^p subsets (including the empty set)
# Here we get subset containing indexes of the variables to include
# Note that power.set doesn't include the empty set.
subsets <- power.set(1:p)
subsets.len <- length(subsets)
# To get the plots with the fewest variables first, we do a litte trick:
# While we find out which plots to include, we build a list
# where each element of a list is the index of the subset,
# and the index of the element is the size of the subset.
# (The +1 is because the limits are includesive!)
s.included <- vector("list", subset.max.size - subset.min.size + 1)
plots <- 0
for (i in 1:subsets.len)
{
s <- subsets[[i]]
s.len <- length(s)
if (s.len >= subset.min.size && s.len <= subset.max.size)
{
plots <- plots + 1
s.included[[s.len - subset.min.size + 1]] <- c(s.included[[s.len - subset.min.size + 1]], i)
}
}
# Now it's possible to build the subset index vector;
# we disregard the size of each subset; no more need to know it.
s.indexes <- c()
for (s in s.included)
{
s.indexes <- c(s.indexes, s)
}
# We want the best view
plot.per.row <- ceiling(plots^(1/2))
plot.per.column <- ceiling(plots^(1/2))
if (use.optimale.size)
{
mfrow.parameters <- find.optimal.mfrow.size(plots)
plot.per.row <- mfrow.parameters[1]
plot.per.column <- mfrow.parameters[2]
}
plot.width <- 300
plot.height <- 200
if (!is.null(filename))
png(file=paste(filename, ".png", sep=""), bg="white", width = plot.per.row * plot.width, height = plot.per.column * plot.height)
par(mfrow = c(plot.per.row, plot.per.column))
# There's no need to calculate a whole lot several times:
ybar <- as.vector(colMeans(dataset))
S <- as.matrix(var(dataset))
current <- 1
for (i in s.indexes)
{
s <- subsets[[i]]
s.len <- length(s)
if (s.len > subset.max.size)
next
cat("Processing subset no.", current, "out of", plots, "\n")
# Container for our values
squared.dist <- numeric(n)
# qr.solve(A) = A^(-1)
Sinv <- qr.solve(S[s,s])
# Then calculate the squared distance for each datapoint
for (i in 1:n)
{
c <- dataset[i,s] - ybar[s]
squared.dist[i] <- t(c) %*% Sinv %*% c
}
# Finding the order statistic
squared.dist <- sort(squared.dist)
qqchisq(squared.dist, paste(colnames(dataset)[s], collapse=", "), s.len)
current <- current + 1
}
if (!is.null(filename))
dev.off()
}
# Example:
A <- matrix(rnorm(2000, mean=3, sd=2), ncol=8)
qqmultinorm(A, 2, 2, "multinorm-optimal.size", use.optimale.size = T)
qqmultinorm(A, 2, 2, "multinorm", use.optimale.size = F)