% No 'submit' option for the problems by themselves.
\documentclass{cos302}
% Use the 'submit' option when you submit your solutions.
%\documentclass[submit]{cos302}
% Put in your full name and email address.
\name{Your Name}
\email{email@princeton.edu}
\discussants{Marie Curie \\ Leonhard Euler}
% You don't need to change these.
\course{COS302-F23}
\assignment{Assignment \#5}
\duedate{6:00pm Wednesday 25 October 2023}
\dropboxurl{https://www.gradescope.com/courses/606160/assignments/3456405}
\usepackage{bm} % for bold math
\usepackage{mathtools} % colon-equals
\begin{document}
% IMPORTANT: Uncomment the \documentclass command above with the [submit] option and put your name where it says ``your name''.
\begin{center}
Assignments in COS 302 should be done individually. See the \href{https://www.cs.princeton.edu/courses/archive/fall23/cos302/files/syllabus.pdf}{course syllabus} for the collaboration policy.
\vspace{\baselineskip}
\textcolor{red}{%
Remember to append your Colab PDF as explained in the first homework, with all outputs visible.\\
When you print to PDF it may be helpful to scale at 95\% or so to get everything on the page.}
\end{center}
\begin{problem}[20pts]
We sometimes need to introduce norms for matrices.
This comes up when we want to talk about the magnitude of a matrix, or when we need a notion of distance between matrices.
One important matrix norm is the Frobenius norm, which can be written in several ways for a matrix~$\bm{A}\in\mathbb{R}^{m \times n}$:
\begin{align*}
\lVert\bm{A}\rVert_F &= \sqrt{\sum_{i=1}^m\sum_{j=1}^n A_{i,j}^2}
= \sqrt{\operatorname{trace}(\bm{A}^{T}\bm{A})}
\end{align*}
\begin{enumerate}[(A)]
\item Show that the Frobenius norm is also the square root of the sum of the squared singular values.
\item Assume that~$\bm{A}$ is square and invertible, with a very small Frobenius norm.
What kind of value would you expect to get for the Frobenius norm of~$\bm{A}^{-1}$?
\end{enumerate}
\end{problem}
\newpage
\begin{problem}[25pts]
In general, computing the determinant of an~$n \times n$ matrix scales as~$n^3$ in computational cost.
When the matrix is highly structured, however, it can sometimes be possible to take advantage of that structure to save computation for quantities such as the determinant.
One example of such structure is in \emph{tridiagonal} matrices, which look like this:
\begin{align*}
\bm{T} &= \begin{bmatrix}
a_1 & b_1 & 0 & 0 & \cdots & 0\\
c_1 & a_2 & b_2 & 0 & & 0\\
0 & c_2 & a_3 & \ddots & & \vdots\\
0 & 0 & \ddots & \ddots & b_{n-2}& 0\\
\vdots & & & c_{n-2} & a_{n-1} & b_{n-1} \\
0 & 0 & \cdots & 0 & c_{n-1} & a_n
\end{bmatrix}
\end{align*}
Such matrices can come up when simulating a physical system with local structure, e.g., a spring-mass system.
\begin{enumerate}[(A)]
\item We would like to compute the determinant of the matrix~$\bm{T}$ above, which we are assuming is invertible.
Let~$d_m$ denote the determinant of the upper left~$m\times m$ submatrix.
So,~$d_1=a_1$ and~$d_n=|\bm{T}|$ (computing the whole determinant).
Use \href{http://mathworld.wolfram.com/DeterminantExpansionbyMinors.html}{expansion by minors} to compute~$d_n$ in terms of~$d_{n-1}$,~$d_{n-2}$, and any of the~$a_i$,~$b_i$, or~$c_i$.
If necessary you can take~$d_0=1$ and~$d_{i}=0$ for $i<0$.
Carefully show how you arrived at your answer.
\item Based on this recurrence relation, how would you expect the computational cost to scale for computing the determinant of a tridiagonal matrix?
\end{enumerate}
\end{problem}
\newpage
\begin{problem}[23pts]
In this problem, you'll look at singular value decomposition as a way to find a low-rank approximation to a matrix.
\begin{enumerate}[(A)]
\item Upload any (tasteful) image you want to Colab.
Note that life is a little easier if you put it in your Google drive and access it from there so you don't have to constantly re-upload it every time you come back after a break.
Load the image as a NumPy array.
There are various ways to do this, but the easiest may be to use \href{https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.imread.html}{\texttt{imread}} from matplotlib.
There is a useful tutorial \href{https://matplotlib.org/tutorials/introductory/images.html}{here} also.
Once you have a NumPy array, look at its shape: it is probably $\text{width} \times \text{height} \times 3$ or something to that effect, with the third dimension usually reflecting red, green, and blue channels.
Make the image grayscale by taking the mean across this third dimension.
This should make it into a $\text{width} \times \text{height}$ array.
Use \texttt{imshow} as in previous homeworks to render the grayscale image.
\item Import \texttt{numpy.linalg} and use the \texttt{svd} function to compute the singular value decomposition of the image matrix.
This will return three things: a~$\bm{U}$ matrix, a vector containing the singular values, and a~$\bm{V}^T$ matrix.
Print the shapes of these arrays, and then figure out how to ``reassemble'' these three arrays with multiplication to reconstruct the image.
Render the reconstruction and verify that it looks like the original image.
\item Plot the \href{https://docs.scipy.org/doc/numpy/reference/generated/numpy.cumsum.html}{cumulative sum} of the singular values.
Sum them from largest to smallest.
\item Create three different low-rank approximations to your image.
Create one of rank 5, one of rank 10, and one of rank 25.
(Hint: you can do this by reconstructing the image matrix as above, but with zeros for all the singular values with index larger than the rank of your approximation.)
Render each of the three images.
\item Create three new matrices, each with the pixel-wise squared difference between the original image and the low-rank approximation.
Render these as images and qualitatively describe what kind of visual structure seems to be lost in the low-rank approximations.
\end{enumerate}
\end{problem}
% \newpage
\begin{problem}[30pts]
In this problem you will use SVD to model a text corpus, a small subset of New York Times articles.
You will use a ``bag of words'' representation in which documents are represented by the counts of words, usually excluding very common ``stop words'' like \emph{the} and \emph{and}.
Download \href{https://www.cs.princeton.edu/courses/archive/fall23/cos302/files/nyt.pkl.gz}{\texttt{nyt.pkl.gz}} and upload it to your Google drive so you don't have to upload it every time you open the Colab; it's a fairly big file.
Here's some code to get you going:
\begin{lstlisting}[style=python]
import pickle as pkl
import numpy as np
import gzip

# Load the pickled corpus from Google Drive (adjust the path to your setup).
filename = 'drive/My Drive/COS 302/nyt.pkl.gz'
with gzip.open(filename, 'rb') as fh:
    nyt = pkl.load(fh)
documents = nyt['docs']
vocab = nyt['vocab']

# Create reverse lookup table.
vocab_indices = dict([(w, i) for (i, w) in enumerate(vocab)])
M = len(documents)
N = len(vocab)
print('%d documents, %d words' % (M,N))

# Build the M x N document-by-word count matrix.
count_mat = np.zeros((M,N))
for mm, doc in enumerate(documents):
    for word, count in doc.items():
        count_mat[mm, vocab_indices[word]] = count
\end{lstlisting}
\begin{enumerate}[(A)]
\item Typically, raw counts don't lead to discovery of interesting structure.
Instead, it is common to use something like \href{https://en.wikipedia.org/wiki/Tf-idf}{TF-IDF}, which stands for \emph{term frequency-inverse document frequency}.
Term frequency is the number of times word~$n$ appeared in document~$m$, divided by the total number of words in document~$m$.
\begin{align*}
\mathrm{tf}_{m,n} &= \frac{\text{\# times word $n$ appears in doc $m$}}{\text{total \# of words in doc $m$}}
\end{align*}
Transform \texttt{count\_mat} from the code above into a term frequency matrix.
\item Inverse document frequency is typically the natural log of the total number of documents, divided by one plus the number of documents in which word~$n$ appears.
(The plus-one ensures that you're not dividing by zero.)
\begin{align*}
\mathrm{idf}_{n} &= \log\frac{\text{total \# of documents}}{1 + \text{\# of documents with word $n$}}
\end{align*}
Compute the idf vector from \texttt{count\_mat}.
\item Now compute the TF-IDF matrix by multiplying (broadcasting) $\mathrm{tf}_{m,n}$ and~$\mathrm{idf}_n$.
Use \href{https://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.svd.html}{\texttt{numpy.linalg.svd}} to take the SVD of this TF-IDF matrix; it may take a minute since the matrix is relatively large.
Plot the singular values in decreasing order.
\item The right singular vectors (columns of the~$\bm{V}$ matrix) will ideally represent interesting topical dimensions.
For each of the top 20 right singular vectors: identify the words in the vocabulary that have the largest entries in the vector and print them.
That will probably mean looping over the first 20 right singular vectors, doing an appropriate \texttt{argsort} and then finding that entry in the \texttt{vocab} variable.
Do you see any interesting qualitative structure in these groups of words?
\end{enumerate}
\end{problem}
\newpage
\begin{problem}[2pts]
Approximately how many hours did this assignment take you to complete?
\end{problem}
My notebook URL:
{\small \url{https://colab.research.google.com/XXXXXXXXXXXXXXXXXXXXXXX}}
\subsection*{Changelog}
\begin{itemize}
\item 2 October 2023 -- Initial F23 version.
\end{itemize}
% \includepdf[pages=-]{mynotebook.pdf}
\end{document}