qqmultinorm.R – evaluating the normality of a sample

icon3 ,
1 Star2 Stars3 Stars4 Stars5 Stars (No Ratings Yet)
Loading ... Loading ...
Posted August 27, 2009 at 11:44 (UTC)

I wrote a R-script that can be used to evaluate for multivariate normality of a sample. It’s a kind of generalisation to the qqnorm, but this one just uses sums of the squared statistical distance which is then chi squared distributed with degrees of freedom equalling the number of squared statistical distance summed.

The useful thing about this script, is that it’s able to plot all possible combinations of the variables. It’s possible to specify the minimum number of variables to compare and the maximum number of variables to compare. All possible combinations are found by calculating the power set (using binary representation through the decimal to binary conversion, dec2bin, because if a set has k elements, then its power set has 2^k elements, and the binary representation of the numbers from 1 to 2^k are then used to pick out the subsets in the power set).

Please notice that it’s possible to specify a filename so that the plots are written to a png file. This is by far the easiest thing – and only possibility – if there’s more than a few plots.

I haven’t wrote a lot of documentation besides this, but feel free to ask if in doubt of anything!

# File name: qqmultinorm.R
#
# This R-code is made by:
# Mikkel Meyer Andersen, Denmark
# mikl [funny-a] math [.] aau [.] dk or 
# mikl [funny-a] mikl [.] dk
#
# Licence: GPLv2
#
# Feel free to use it, but if you do I'll like to hear about it (just for fun).
# If you make corrections, please submit them back so others can enjoy them as well.
 
qqchisq <- function(y, main, df=2, continuity.correction = 0.5)
{
  n <- length(y)
  y <- sort(y)
  c <- numeric(n)
 
  for (i in 1:n)
    c[i] <- qchisq((i-continuity.correction)/n, df=df)
 
  plot(c, y, xlab="Theoretical Quantiles", ylab="Sample Quantiles", main=main)
  lines(c(c[1], c[n]), c(c[1], c[n]), type="l")
}
 
dec2bin <- function(x)
{
  if (!is.vector(x) || length(x) != 1 || x < 0)
    stop("x must be a non-negative integer")
 
  N <- length(x)
  ndigits <- floor(log2(x)) + 1
  bin <- numeric(ndigits)
 
  for (i in (ndigits-1):0)
  {
    tmp <- 2^i
 
    if (x %/% tmp >= 1)
    {
      bin[i+1] <- 1
      x <- x - tmp
    }
  }
 
  return(rev(bin))
}
 
# Returns the power set without the empty set
power.set <- function(v)
{
  n <- length(v)  
  N <- 2^n - 1
  ps <- vector("list", N)
 
  for (i in 1:(N-1))
  {
    Nbin <- dec2bin(i)
    Nbin <- c(numeric(n-length(Nbin)), Nbin)
    Nbin <- rev(Nbin)
    ps[[i]] <- v[which(Nbin == 1)]
  }
 
  ps[[N]] <- v
 
  return(ps)
}
 
# dataset: variables in columns and observations as rows
# subset.min.size, subset.max.size: inclusive limits
# filename: if specified, the plot are saved as a png file with this filename
qqmultinorm <- function(dataset, subset.min.size = 1, subset.max.size = 4, filename = NULL)
{
  p <- ncol(dataset)
  n <- nrow(dataset)
 
  if (subset.min.size < 1) stop("subset.min.size < 1")
  if (subset.min.size > p) stop("subset.min.size > p")
  if (subset.max.size < 1) stop("subset.max.size < 1")
  if (subset.max.size > p) stop("subset.max.size > p")
  if (subset.min.size > subset.max.size) stop("subset.min.size > subset.max.size")
 
  if (is.null(colnames(dataset)))
    colnames(dataset) <- 1:p
 
  # We have p variables. If all is to be checked against each other,
  # then we have a power-set with 2^p subsets (including the empty set)
 
  # Here we get subset containing indexes of the variables to include
  # Note that power.set doesn't include the empty set.
  subsets <- power.set(1:p)
  subsets.len <- length(subsets)
 
  # To get the plots with the fewest variables first, we do a litte trick:
  # While we find out which plots to include, we build a list
  # where each element of a list is the index of the subset,
  # and the index of the element is the size of the subset.
  # (The +1 is because the limits are includesive!)
  s.included <- vector("list", subset.max.size - subset.min.size + 1)
 
  plots <- 0
 
  for (i in 1:subsets.len)
  {
    s <- subsets[[i]]
    s.len <- length(s)
 
    if (s.len >= subset.min.size && s.len <= subset.max.size)
    {
      plots <- plots + 1
      s.included[[s.len - subset.min.size + 1]] <- c(s.included[[s.len - subset.min.size + 1]], i)
    }
  }
 
  # Now it's possible to build the subset index vector; 
  # we disregard the size of each subset; no more need to know it.
  s.indexes <- c()
 
  for (s in s.included)
  {
    s.indexes <- c(s.indexes, s)
  }
 
  # We want a squared view
  plot.per.row <- ceiling(plots^(1/2))
  plot.per.column <- ceiling(plots^(1/2))
  plot.width <- 200
  plot.height <- 200
 
  if (!is.null(filename))
    png(file=paste(filename, ".png", sep=""), bg="white", width = plot.per.row * plot.width, height = plot.per.column * plot.height)
 
  par(mfrow = c(plot.per.row, plot.per.column))  
 
  # There's no need to calculate a whole lot several times:
  ybar <- as.vector(colMeans(dataset))
 
  S <- as.matrix(var(dataset))
 
  current <- 1  
  for (i in s.indexes)
  {
    s <- subsets[[i]]
    s.len <- length(s)
 
    if (s.len > subset.max.size)
      next
 
    cat("Processing subset no.", current, "out of", plots, "\n")
 
    # Container for our values
    squared.dist <- numeric(n)
 
    # qr.solve(A) = A^(-1)  
    Sinv <- qr.solve(S[s,s])
 
    # Then calculate the squared distance for each datapoint
    for (i in 1:n)
    {
      c <- dataset[i,s] - ybar[s]
      squared.dist[i] <- t(c) %*% Sinv %*% c
    }
 
    # Finding the order statistic
    squared.dist <- sort(squared.dist)
 
    qqchisq(squared.dist, paste(colnames(dataset)[s], collapse=", "), s.len)
 
    current <- current + 1
  }
 
  if (!is.null(filename))
    dev.off()
}
 
# Example:
A <- matrix(rnorm(2000, mean=3, sd=2), ncol=8)
qqmultinorm(A, 1, 3, "multinorm-1-3")
#qqmultinorm(A, 4, 4, "multinorm-4")
#qqmultinorm(A, 5, 5, "multinorm-5")
#qqmultinorm(A, 6, 8, "multinorm-6-8")
multinorm-1-3

multinorm-1-3

Leave a Comment

Please note: Comment moderation is enabled and may delay your comment. There is no need to resubmit your comment.