% No 'submit' option for the problems by themselves.
\documentclass{cos324}
% Comment out the line above and uncomment the line below for submitting.
% \documentclass[submit]{cos324}
\usepackage{newtxtext,newtxmath}
\usepackage{bm}
% Put in your full name and email address.
\name{Your Name}
\email{email@princeton.edu}
% You don't need to change these.
\course{COS324-F18}
\assignment{Assignment \#4}
\duedate{23:55pm Friday 12 April 2019}
\dropboxurl{https://dropbox.cs.princeton.edu/COS324_S2019/HW4}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
% If you want to include graphics.
\usepackage{graphicx}
\begin{document}
% IMPORTANT: Uncomment the \documentclass command above with the [submit] option.
\begin{problem}[1pt]
Use the instructions above to put your name on the assignment when you compile this document.
\end{problem}
\begin{problem}[10pts]
In~$d$ dimensions, consider a hypersphere of unit radius, centered at zero, which is inscribed in a hypercube, also centered at zero, with edges of length two.
What fraction of the hypercube's volume is contained within the hypersphere?
Write this as a function of~$d$ and make a log-log plot.
What happens when~$d$ becomes large?
[You can just look up the equation for the volume of a hypersphere.]
\end{problem}
\begin{problem}[15pts]
Consider a $d$-dimensional Gaussian distribution with zero mean and identity covariance matrix, i.e.,~$\mathcal{N}(0,\bm{I}_d)$.
Imagine drawing points from this distribution and calculating their squared~$L^2$ distance from the origin.
What distribution will these squared distances have for a given~$d$?
[Hint: look up the gamma distribution.]
Plot the probability density function over the squared distance for $d=1, 10, 100, 1000, 10000$.
Simulate many such Gaussian random variables and create a histogram to verify that your calculations are correct.
\end{problem}
\begin{problem}[34pts]
Implement K-Means clustering from scratch.
That is, don't use a third-party machine learning implementation like \texttt{scikit-learn}; math libraries like \texttt{numpy} are fine.
Go out and grab an image data set like:
\begin{itemize}
\item CIFAR-10 or CIFAR-100: \\ \url{http://www.cs.toronto.edu/~kriz/cifar.html}
\item MNIST Handwritten Digits: \\ \url{http://yann.lecun.com/exdb/mnist/}
\item Small NORB (toys): \\ \url{http://www.cs.nyu.edu/~ylclab/data/norb-v1.0-small/}
\item Street View Housing Numbers: \\ \url{http://ufldl.stanford.edu/housenumbers/}
\item STL-10: \\ \url{http://cs.stanford.edu/~acoates/stl10/}
\item Labeled Faces in the Wild: \\ \url{http://vis-www.cs.umass.edu/lfw/}
\end{itemize}
Figure out how to load it into your environment and turn it into a set of vectors.
Run K-Means on it for a few different~$K$ and show some results from the fit. What do the mean images look like?
What are some representative images from each of the clusters?
Are the results wildly different for different restarts and/or different~$K$?
Plot the K-Means objective function (distortion measure) as a function of iteration and verify that it never increases.
\end{problem}
\begin{problem}[20pts]
Implement K-Means++ and see if it gives you more satisfying initializations for K-Means.
Explain your findings.
\end{problem}
\begin{problem}[20pts]
Download the \texttt{cities100.csv} data set from the course website.
This file contains the latitude and longitude of the 100 most populous cities in the world.
Convert these latitudes and longitudes into pairwise distances using geodesic or great-circle distance (look at \texttt{geopy} for this conversion).
With this distance matrix in hand, use \texttt{scipy.cluster.hierarchy} to explore these data with hierarchical clustering.
Produce at least three different dendrograms (with the city names labeled) that use different configurations of linkage, etc.
Explain any differences you see arising from different choices of linkage.
\end{problem}
\subsection*{Changelog}
\begin{itemize}
\item 1 April 2019 -- Initial version.
\end{itemize}
\end{document}