% No 'submit' option for the problems by themselves.
\documentclass{cos302}
% Use the 'submit' option when you submit your solutions.
%\documentclass[submit]{cos302}
% Put in your full name and email address.
\name{Your Name}
\email{email@princeton.edu}
\discussants{Marie Curie \\ Leonhard Euler}
% You don't need to change these.
\course{COS302-S20}
\assignment{Assignment \#10}
\duedate{6:00pm Wednesday 6 December 2023}
\dropboxurl{https://www.gradescope.com/courses/606160/assignments/3686642}
\usepackage{bm} % for bold math
\usepackage{mathtools} % colon-equals
\usepackage{multirow}
\usepackage{graphicx}
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{bbm}
\usepackage{underscore}
\begin{document}
% IMPORTANT: Uncomment the \documentclass command above with the [submit] option and put your name where it says ``your name''.
\begin{center}
Assignments in COS 302 should be done individually. See the \href{https://www.cs.princeton.edu/courses/archive/fall23/cos302/files/syllabus.pdf}{course syllabus} for the collaboration policy.
\vspace{\baselineskip}
\textcolor{red}{%
Remember to append your Colab PDF as explained in the first homework, with all outputs visible.\\
When you print to PDF it may be helpful to scale at 95\% or so to get everything on the page.}
\end{center}
\begin{problem}[35pts]
In gradient descent, we attempt to minimize some function $f(\bm{x})$ by starting with some initial~$\bm{x}_0$ and then iteratively modifying the parameters $\bm{x} \in \mathbb{R}^n$ according to the following formula:
\begin{align*}
\bm{x}_{t+1} &\gets \bm{x}_t - \lambda (\nabla_{\bm{x}} f(\bm{x}_t))^T
\end{align*}
for some small $\lambda \geq 0$ known as the \emph{learning rate} or \emph{step size}.
(Here we're taking the gradient to be a row vector, which is why we're transposing it to update a column vector.)
This procedure modifies $\bm{x}$ so as to move in a direction proportional to the negative gradient.
This basic procedure is the basis for many optimization algorithms across science and engineering.
Consider the simple function $f(\bm{x}) = \bm{x}^T A \bm{x}$ for a (constant) symmetric positive definite matrix $A \in \mathbb{R}^{n \times n}$.
\begin{enumerate}[(A)]
\item Implement a function \texttt{f(A, x)} that takes as input an $n \times n$ numpy array \texttt{A} and a 1D array \texttt{x} of length $n$ and returns the output for~$f(\bm{x})$ above.
Test it on the values
\begin{align*}
\bm{x} &= \begin{bmatrix}
-1 \\
2
\end{bmatrix}
&
\bm{A} &= \begin{bmatrix}
3 & -1 \\
-1 & 2
\end{bmatrix}\,.
\end{align*}
\item Implement a function \texttt{grad\_f(A, x)} that takes the same two arguments as above but returns $\nabla_{\bm{x}} f(\bm{x})$ evaluated at \texttt{x}.
Test it on the values~$\bm{x}$ and~$\bm{A}$ above.
\item Now implement a third and final function \texttt{grad\_descent(A, x0, lr, num\_iters)} that takes the additional arguments \texttt{lr}, representing the learning rate $\lambda$ above, and \texttt{num\_iters} indicating the total number of iterations of gradient descent to perform.
The function should loop and print the values of $\bm{x}_t$ and $f(\bm{x}_t)$ at each iteration of gradient descent.
\item Use your function to perform gradient descent on $f$ with
\begin{align*}
\bm{x}_0 &= \begin{bmatrix}
10 \\
10
\end{bmatrix}
&
\bm{A} &= \begin{bmatrix}
1 & 0 \\
0 & 4
\end{bmatrix}\,.
\end{align*}
Run gradient descent for 50 iterations with learning rates of 1, 0.25, 0.1, and 0.01.
What do you notice?
Does $\bm{x}_t$ always converge to the same value?
Does our gradient descent algorithm work every time?
\end{enumerate}
\end{problem}
\newpage
\begin{problem}[35pts]
Debugging gradients is one of the important skills for machine learning.
One powerful debugging tool is to use \emph{finite differences} to estimate the gradient of a function numerically.
The idea of finite differences is simple: estimate the rise versus run of the function using a small step size.
This is kind of like when you take the limit in the definition of the derivative, except you just don't take the limit quite to zero.
If~$f:\mathbb{R}^d\to\mathbb{R}$ and~$\epsilon >0$, then
\begin{align*}
\frac{\partial}{\partial x_i}f(\bm{x}) &\approx \frac{1}{2\epsilon}(f(x_1, \ldots, x_i+\epsilon, \ldots, x_d) - f(x_1, \ldots, x_i-\epsilon, \ldots, x_d)\,.
\end{align*}
This is a ``two-sided'' finite differences estimate because it is moving up and down relative to~$\bm{x}$.
In a typical application, you might code this up using~$\epsilon\approx 10^{-4}$ or so.
To estimate a gradient, you would loop from~$i=1\ldots d$ and evaluate all the partials.
\begin{enumerate}[(A)]
\item Implement a function \texttt{finite_diff(func, x, epsilon)} that takes a function \texttt{func} as an argument (which itself is of the form~\texttt{func(x)} taking a vector and returning a scalar) and estimates the gradient via finite differences as above.
This will involve a loop over the entries of \texttt{x} that modifies the entries and evaluates the function twice.
The function~\texttt{finite_diff} should return a vector that is the same size as~\texttt{x}.
\item Consider the function $f(\bm{x}) = (\bm{c}-\bm{A}\bm{x})^{\text{T}}(\bm{c}-\bm{A}\bm{x})$ for~$\bm{x}\in\mathbb{R}^5$ where
\begin{align*}
\bm{c} &= \begin{bmatrix}
1 \\
2 \\
3
\end{bmatrix} &
\bm{A} &= \begin{bmatrix}
1 & 2 & 3 & 4 & 5\\
6 & 7 & 8 & 9 & 10\\
11 & 12 & 13 & 14 & 15
\end{bmatrix}
\end{align*}
Implement this function and its gradient in Python as a function of~\texttt{x}.
The gradient function will require that you use the differentiation identities as in Sec 5.5 of the book, just like you did in HW9; just now you'll also write it in code.
\item Choose some non-zero vector for \texttt{x} and compare the result from your implementation to what you get from the finite differences estimate.
They should be similar but not exactly the same.
(If you get weird values that are all integers, make sure you're using floating point numbers in your numpy arrays.)
\item Now try automatic differentiation. Rather than \texttt{import numpy as np} do something like this:
\begin{verbatim}
import autograd.numpy as np
from autograd import grad
\end{verbatim}
Then get a third estimate of the gradient by calling \texttt{grad} on your function from part (B) and evaluating that at \texttt{x}.
\end{enumerate}
\end{problem}
\newpage
\begin{problem}[28pts]
\begin{enumerate}[(A)]
\item Use \texttt{autograd} as in the previous problem to write a general-purpose function that takes an update step for Newton's method of minimization. The signature for this function should look like \texttt{newton_step(func, x)}. You'll need to use \texttt{grad} to compute first and second derivatives within the function.
\item We're going to minimize the function $f(x)=\sin(x)/x$, which is sometimes called the ``sinc'' function.
To get a little bit of intuition for it, start by plotting it on the range $[-8,8]$.
\item
Try minimizing the sinc function starting from several different points: $x_0=3.0$, $x_0=-4.0$, and $x_0=0.5$.
You'll need to write a loop that iterates your \texttt{newton_step} function.
You should have the loop stop when the update only takes a small step, say one smaller in magnitude than $10^{-4}$.
You should expect the loop to only need to run for a few iterations (not more than ten).
Explain what happens for the different initializations.
\end{enumerate}
\end{problem}
\newpage
\begin{problem}[2pts]
Approximately how many hours did this assignment take you to complete?
\end{problem}
My notebook URL:
{\small \url{https://colab.research.google.com/XXXXXXXXXXXXXXXXXXXXXXX}}
\subsection*{Changelog}
\begin{itemize}
\item 15 November 2023 -- Initial F23 version.
\end{itemize}
%\includepdf[pages=-]{hw10solnb.pdf}
\end{document}