demo <- function() { df <- read.csv("LabelMe.csv", as.is=T) # show a couple of random images from df bundle <- test.train(df, ntest=500) train <- bundle$train test <- bundle$test # show train[1:10,] and test[1:10,] to confirm that they don't # contain the same images model <- fit.classifier(train) # show model[["class.prop"]]; model[["class.mean"]]; model[["class.sd"]] plot.model(model) # show test[30,] and look at the picture / class # test.rgb <- as.double(test[30,3:5]) # classify(test.rgb, model) # compute the posterior class distribution for each test point test.post <- adply(test, 1, function (x) classify(as.double(x[3:5]), model)) # (optional) train.post <- adply(train, 1, function (x) classify(as.double(x[3:5]), model)) # form the arg.max classes best <- apply(test.post, 1, function (x) names(x[6:13])[which.max(x[6:13])]) test.post$best <- best # (optional) best <- apply(train.post, 1, function (x) names(x[6:13])[which.max(x[6:13])]) train.post$best <- best # show a couple of these: test.post[342, c("class", "best")] # make the confusion table conf <- table(test.post[,c("class", "best")]) conf <- aaply(conf, 1, function (x) x / sum(x)) # compute the accuracy acc <- sum(diag(conf)) / sum(conf) plot.confusion(conf) } # make test and train test.train <- function(df, ntest=500) { test.idx <- sort(sample(dim(df)[1], ntest, replace=F)) test <- df[test.idx,] train <- df[-test.idx,] list(train=train, test=test) } # fit a gaussian generative classifier fit.classifier <- function(df) { # fit the means and variances class.mean <- dlply(df, c("class"), function (x) apply(x[,c("ave_red", "ave_green", "ave_blue")], 2, mean)) class.sd <- dlply(df, c("class"), function (x) apply(x[,c("ave_red", "ave_green", "ave_blue")], 2, sd)) # fit the proportions class.prop <- table(df[,"class"]) class.prop <- class.prop / sum(class.prop) list(class.mean=class.mean, class.sd=class.sd, class.prop=class.prop) } classify <- function(rgb, model) { classes <- names(model$class.prop) post <- sapply(classes, function (class) { log(model[["class.prop"]][class]) + dnorm(rgb[1],mean=model[["class.mean"]][[class]][1], sd=model[["class.sd"]][[class]][1],log=T) + dnorm(rgb[2], mean=model[["class.mean"]][[class]][2], sd=model[["class.sd"]][[class]][2], log=T) + dnorm(rgb[3],mean=model[["class.mean"]][[class]][3], sd=model[["class.sd"]][[class]][3], log=T) }) post <- post - log.sum(post) names(post) <- classes post } # given log(v), returns log(sum(v)) log.sum <- function(v) { log.sum.pair <- function(x,y) { if ((y == -Inf) && (x == -Inf)) { return(-Inf); } if (y < x) return(x+log(1 + exp(y-x))) else return(y+log(1 + exp(x-y))); } if (length(v) == 1) return(v) r <- v[1]; for (i in 2:length(v)) r <- log.sum.pair(r, v[i]) return(r) } # plot a confusion matrix plot.confusion <- function(conf) { melted <- melt(conf, c("class", "best")) p <- ggplot(data=melted, aes(x=best, y=class, size=value)) p <- p + geom_point(shape=15) p <- p + scale_area(to=c(1,20)) p <- p + theme_bw() + opts(axis.text.x = theme_text(angle=90)) p } # plot model plot.model <- function(model) { melted <- melt(ldply(model[["class.mean"]], function (x) x)) p <- ggplot(data=melted) p <- p + geom_point(aes(x=variable, y=value)) p <- p + facet_grid(~ class) p <- p + theme_bw() + opts(axis.text.x = theme_text(angle=90)) p }