# The MASS library provides mvrnorm, the multivariate gaussian sampler.
library(MASS)
# The ggplot2 library contains functions for making nice plots.
library(ggplot2)

# Generate data from a mixture of gaussians.
#
# Args:
#   n: number of observations
#   k: number of mixture components
#   prior.mean: prior mean of the component locations (length p)
#   prior.var: prior variance (scalar) of the component locations
#
# Returns a list with the true locations, assignments, and observations.
generate.data <- function(n, k, prior.mean, prior.var) {
  p <- length(prior.mean)
  # draw mixture locations from the isotropic gaussian prior
  # (generalized: the original hard-coded diag(prior.var, 2))
  locs <- mvrnorm(k, mu = prior.mean, Sigma = diag(prior.var, p))
  obs <- matrix(0, nrow = n, ncol = p)
  z <- numeric(n)
  for (i in seq_len(n)) {
    # assignment is uniform over the k components
    z[i] <- sample(seq_len(k), 1)
    # unit-variance gaussian likelihood around the assigned location
    obs[i, ] <- mvrnorm(1, mu = locs[z[i], ], Sigma = diag(1, p))
  }
  list(locs = locs, z = z, obs = obs)
}

# Variational inference for a k-component gaussian mixture with unit
# observation variance and an isotropic gaussian prior on the locations.
# The user must specify (a) the number of components and (b) the prior
# over mixture locations.
#
# Args:
#   obs: n x p matrix of observations
#   k: number of mixture components
#   mu0, sigma0: prior mean (length p) and variance (scalar) on locations
#   conv.criterion: relative ELBO change at which to stop iterating
#   make.movie: if TRUE, record a plot of the fit at each iteration
#
# Returns the variational parameters, the ELBO trace, and the movie.
mixt.gauss.vb <- function(obs, k, mu0, sigma0,
                          conv.criterion = 1e-10, make.movie = FALSE) {
  n <- dim(obs)[1]
  p <- dim(obs)[2]

  # initialize the locations near the data mean; variances to the prior
  var.mu <- mvrnorm(k, colMeans(obs), diag(sigma0 / 100, p))
  var.sigma <- rep(sigma0, k)
  # initialize the assignment distributions to be uniform
  var.z <- matrix(1 / k, nrow = n, ncol = k)

  # set up the initial ELBO and (optionally) the movie
  elbo <- mixt.gauss.elbo(obs, var.z, var.mu, var.sigma, mu0, sigma0)
  decomp <- inferred.decomp(var.mu, var.sigma, var.z)
  movie <- list()
  if (make.movie) {
    movie <- list(plot.mixture(decomp$locs, decomp$z, obs))
  }

  # coordinate ascent until the relative ELBO change is small
  conv <- 1
  iter <- 0
  while (conv > conv.criterion) {
    iter <- iter + 1
    # update the variational posterior over each point's assignment
    for (i in seq_len(n)) {
      log.q.z <- numeric(k)
      for (j in seq_len(k)) {
        log.q.z[j] <- -log(k)
        log.q.z[j] <- log.q.z[j] + obs[i, ] %*% var.mu[j, ]
        # E_q[mu'mu] = p * var.sigma + mu'mu
        # (generalized: the original hard-coded 2 * var.sigma, i.e. p = 2)
        log.q.z[j] <- log.q.z[j] -
          (p * var.sigma[j] + var.mu[j, ] %*% var.mu[j, ]) / 2
      }
      # normalize in log space for numerical stability
      var.z[i, ] <- exp(log.q.z - log.sum(log.q.z))
    }
    # update the variational posterior over each component's location
    for (j in seq_len(k)) {
      var.mu[j, ] <- mu0 / sigma0 + colSums(var.z[, j] * obs)
      var.mu[j, ] <- var.mu[j, ] / (1 / sigma0 + sum(var.z[, j]))
      var.sigma[j] <- 1 / (1 / sigma0 + sum(var.z[, j]))
    }
    # track the ELBO and the relative change used for convergence
    elbo <- c(elbo,
              mixt.gauss.elbo(obs, var.z, var.mu, var.sigma, mu0, sigma0))
    conv <- (elbo[iter + 1] - elbo[iter]) / abs(elbo[iter])
    # diagnostic plot of the current fit
    if (make.movie) {
      decomp <- inferred.decomp(var.mu, var.sigma, var.z)
      movie <- append(movie, list(plot.mixture(decomp$locs, decomp$z, obs)))
    }
    msg(sprintf("iteration %d; convergence %g", iter, conv))
  }
  list(var.mu = var.mu, var.sigma = var.sigma, var.z = var.z,
       elbo = elbo, movie = movie)
}

# Compute the evidence lower bound (ELBO) of the variational posterior.
#
# Args:
#   obs: n x p matrix of observations
#   var.z: n x k matrix of variational assignment probabilities
#   var.mu: k x p matrix of variational location means
#   var.sigma: length-k vector of variational location variances
#   mu0, sigma0: prior mean and (scalar) variance on mixture locations
mixt.gauss.elbo <- function(obs, var.z, var.mu, var.sigma, mu0, sigma0) {
  p <- dim(obs)[2]
  n <- dim(obs)[1]
  k <- dim(var.mu)[1]
  b <- 0
  # E[log prior] and entropy of the mixture locations
  for (j in seq_len(k)) {
    b <- b - (p / 2) * log(2 * pi * sigma0)
    b <- b - (p * var.sigma[j] + var.mu[j, ] %*% var.mu[j, ]) / (2 * sigma0)
    b <- b + (var.mu[j, ] %*% mu0) / sigma0
    b <- b - (mu0 %*% mu0) / (2 * sigma0)
    # gaussian entropy of q(mu_j)
    b <- b + (p / 2) * log(2 * pi * var.sigma[j]) + (p / 2)
  }
  # E[log prior on z] + E[log likelihood] + entropy of q(z)
  for (i in seq_len(n)) {
    b <- b - log(k)
    # p-dimensional gaussian normalization; the original wrote
    # (1/2)*log(2*pi), which is only correct when p = 1
    b <- b - (p / 2) * log(2 * pi) - obs[i, ] %*% obs[i, ] / 2
    for (j in seq_len(k)) {
      b <- b + var.z[i, j] * (obs[i, ] %*% var.mu[j, ])
      # E_q[mu'mu] = p * var.sigma + mu'mu; the original hard-coded
      # 2 * var.sigma (p = 2), inconsistent with the prior term above
      b <- b - var.z[i, j] *
        (p * var.sigma[j] + var.mu[j, ] %*% var.mu[j, ]) / 2
      b <- b - var.z[i, j] * safe.log(var.z[i, j])
    }
  }
  b
}

# Summarize the variational fit: MAP cluster assignments plus the
# posterior mean locations.
inferred.decomp <- function(var.mu, var.sigma, var.z) {
  # base-R apply replaces plyr::aaply, which was never loaded in this file
  inf.z <- apply(var.z, 1, which.max)
  list(locs = var.mu, z = inf.z)
}

# Posterior predictive log density of the fitted mixture, returned as a
# function of (x, y).  Note: only for 2d.
inferred.pred <- function(var.mu, var.sigma, var.z) {
  k <- dim(var.mu)[1]
  function(x, y) {
    val <- sapply(seq_len(k), function(j) {
      dnorm(x, var.mu[j, 1], 1, log = TRUE) +
        dnorm(y, var.mu[j, 2], 1, log = TRUE)
    })
    # average the component densities in log space
    log.sum(val) - log(k)
  }
}

# ------------------------------------------------------------------------ #
# helper functions                                                         #
# ------------------------------------------------------------------------ #

# Plot 2d data colored by cluster, with the component locations overlaid.
plot.mixture <- function(locs, z, obs) {
  stopifnot(dim(obs)[2] == 2)
  z <- as.factor(z)
  df1 <- data.frame(x = obs[, 1], y = obs[, 2], z = z)
  df2 <- data.frame(x = locs[, 1], y = locs[, 2])
  p <- ggplot()
  p <- p + geom_point(data = df1, aes(x = x, y = y, colour = z),
                      shape = 16, size = 2, alpha = 0.75)
  p <- p + geom_point(data = df2, aes(x = x, y = y), shape = 16, size = 3)
  # opts() was removed from ggplot2; theme() is the modern equivalent
  p <- p + theme(legend.position = "none")
  p
}

# Plot 2d data as a scatter plot.
plot.data <- function(dat) {
  stopifnot(dim(dat)[2] == 2)
  df1 <- data.frame(x = dat[, 1], y = dat[, 2])
  p <- ggplot()
  p <- p + geom_point(data = df1, aes(x = x, y = y), size = 2, alpha = 0.75)
  p
}

# Plot the predictive density as contours with the data overlaid.
plot.pred.and.obs <- function(pred, obs) {
  mydf <- fun2d.on.grid(pred, range(obs[, 1]), range(obs[, 2]), n = 100)
  p <- ggplot()
  p <- p + geom_contour(data = mydf, aes(x = x, y = y, z = z))
  # bug fix: the original referenced the global `d$obs` rather than the
  # `obs` argument
  p <- p + geom_point(aes(x = obs[, 1], y = obs[, 2]), alpha = 0.2)
  p
}

# Given a vector log(a), return log(sum(a)) without underflow.
log.sum <- function(v) {
  log.sum.pair <- function(x, y) {
    # both -Inf: log(0 + 0) = -Inf (avoids -Inf - -Inf = NaN below)
    if ((y == -Inf) && (x == -Inf)) {
      return(-Inf)
    }
    # factor out the larger term so exp() never overflows
    if (y < x) {
      x + log(1 + exp(y - x))
    } else {
      y + log(1 + exp(x - y))
    }
  }
  if (length(v) == 1) {
    return(v)
  }
  r <- v[1]
  for (i in 2:length(v)) {
    r <- log.sum.pair(r, v[i])
  }
  r
}

# Log that maps (near-)zero to a large negative number instead of -Inf,
# so terms like q * log(q) in the ELBO stay finite.
safe.log <- function(x) {
  if (x < 1e-40) -10000 else log(x)
}

# Evaluate f on an (n+1) x (n+1) grid covering xlim x ylim; returns a
# data frame with columns x, y, z suitable for geom_contour.
fun2d.on.grid <- function(f, xlim, ylim, n = 100) {
  x <- seq(from = xlim[1], to = xlim[2], by = (xlim[2] - xlim[1]) / n)
  y <- seq(from = ylim[1], to = ylim[2], by = (ylim[2] - ylim[1]) / n)
  # preallocate one row per grid point
  my.df <- matrix(0, nrow = length(x) * length(y), ncol = 3)
  idx <- 1
  for (xi in x) {
    for (yj in y) {
      my.df[idx, ] <- c(xi, yj, f(xi, yj))
      idx <- idx + 1
    }
  }
  my.df <- data.frame(my.df)
  colnames(my.df) <- c("x", "y", "z")
  my.df
}

# Print a timestamped status message.  The `...` argument is accepted
# for backward compatibility but is currently unused.
msg <- function(s, ...) {
  time <- format(Sys.time(), "%X")
  cat(sprintf("%s %s\n", time, s))
}