library(MASS) library(mclust) library(ggplot2) script <- function() { # first generate 1000 data points from a mixture of 5 gaussians d <- generate.data(1000, 5, c(0,0), 50) # plot the data and the truth plot.mixture(d$locs, d$z, d$obs) # plot the data alone plot.data(d$obs) # run EM on the data one <- run.em(k=5, x=d$obs) # plot the run of EM plot.em.run(one, d$obs) # run EM 100 times runs <- sapply(1:100, function (i) list(run.em(5, x=d$obs))) # let's look at the local optima df <- ldply(1:100, function (i) cbind(run=i, runs[[i]]$lhood)) ggplot(data=df, aes(x=iter, y=lhood, group=run)) + geom_line(alpha=0.5) # get the final log likelihoods for each run lls <- laply(runs, function (run) run$estep$loglik) # get the minimum and plot it which.min(lls) plot.em.run(runs[[67]], d$obs) # get the maximum and plot it which.max(lls) plot.em.run(runs[[93]], d$obs) # tada (?) } # ------------------------------------------------------------------------ # generate data from a mixture of 2d gaussians generate.data <- function(n, k, prior.mean, prior.var) { p <- length(prior.mean) # generate mixture locations from the prior locs <- mvrnorm(k, mu=prior.mean, Sigma=diag(prior.var, 2)) vars <- rgamma(k, shape=1, scale=0.25) # generate the data obs <- matrix(0, nrow=n, ncol=p) z <- numeric(n) for (i in 1:n) { # draw the cluster uniformly at random z[i] <- sample(1:k, 1) # draw the observation from the corresponding mixture location obs[i,] <- mvrnorm(1, mu=locs[z[i],], Sigma=diag(vars[z[i]],p)) } list(locs=locs, vars=vars, z=z, obs=obs) } # run EM run.em <- function(k, x) { # compute the number of data points n <- dim(x)[1] # initialize each data point to a random cluster init.z <- unmap(sample(1:k, n, replace=T)) # compute the first "m step" with those posteriors mstep <- mstep(modelName="VII", data = x, z = init.z) estep <- estep(modelName="VII", data=x, parameters=mstep$parameters) iter <- 1 lhood <- data.frame(iter=iter, lhood=estep$loglik) repeat { iter <- iter + 1 mstep <- mstep(modelName="VII", data=x, z=estep$z) estep <- estep(modelName="VII", data=x, parameters=mstep$parameters) lhood <- rbind(lhood, c(iter=iter, lhood=estep$loglik)) conv <- abs((lhood[iter,"lhood"] - lhood[iter-1,"lhood"]) / lhood[iter-1,"lhood"]) # cat(sprintf("%03d : %02.3g\n", iter, conv)) if (conv < 1e-5) break } list(estep=estep, mstep=mstep, lhood=lhood) } plot.em.run <- function(run, x) { z <- apply(run$estep$z, 1, function (x) which.max(x)) plot.mixture(locs=t(run$mstep$parameters$mean), z=z, obs=x) } # ------------------------------------------------------------------------ # # helper functions # # # plot 2d data from a mixture plot.mixture <- function(locs, z, obs) { stopifnot(dim(obs)[2]==2) z <- as.factor(z) df1 <- data.frame(x=obs[,1], y=obs[,2], z=z) df2 <- data.frame(x=locs[,1], y=locs[,2]) p <- ggplot() p <- p + geom_point(data=df1, aes(x=x, y=y, colour=z), shape=16, size=2, alpha=0.75) p <- p + geom_point(data=df2, aes(x=x, y=y), shape=16, size=3) p <- p + opts(legend.position="none") p } # plot 2d data as a scatter plot plot.data <- function(dat) { stopifnot(dim(dat)[2]==2) df1 <- data.frame(x=dat[,1], y=dat[,2]) p <- ggplot() p <- p + geom_point(data=df1, aes(x=x, y=y), size=2, alpha=0.75) p }