% No 'submit' option for the problems by themselves.
\documentclass{cos324}
% Use the 'submit' option when you submit your solutions.
% \documentclass[submit]{cos324}
\usepackage{newtxtext,newtxmath}
\usepackage{bm}
% Put in your full name and email address.
\name{Your Name}
\email{email@princeton.edu}
% You don't need to change these.
\course{COS324-S19}
\assignment{Assignment \#1}
\duedate{11:55pm, February 15, 2019}
\dropboxurl{https://dropbox.cs.princeton.edu/COS324\_S2019/HW1}
% If you want to include graphics.
\usepackage{graphicx}
\begin{document}
\begin{problem}[5pts]
In COS 324 you'll typeset your assignments in \LaTeX, which is how almost everything in mathematical fields is composed.
It's a good thing to learn and it can produce beautiful documents.
There are a variety of ways to write and compile \LaTeX, from web-based tools like \href{https://www.overleaf.com/}{Overleaf}, to various GUI tools for Mac, Linux and Windows, to plain text editors like Emacs.
In this problem, figure out how to compile this document to a PDF in such a way that it shows your name and email in the top left, following the instructions in the comments above.
Note that compiling this document does not automatically submit it; when you're done, you will still need to upload the PDF (and your code) to the CS dropbox URL at the top of the page.
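If you're working locally rather than in the browser, one common route (assuming a standard \TeX{} distribution is installed) is to run \texttt{pdflatex} on this file from the command line.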
\end{problem}
\section*{Refreshing Probability and Calculus}
\begin{problem}[5pts]
$X$ and~$Y$ are two independent random variables with distributions~$p_X(x)$ and~$p_Y(y)$, respectively. Show that the independence of~$X$ and~$Y$ implies that their covariance is zero.
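\par\noindent
Hint: recall the definition of covariance,
\begin{align*}
\text{cov}[X,Y] &= \mathbb{E}\big[(X-\mathbb{E}[X])(Y-\mathbb{E}[Y])\big] = \mathbb{E}[XY] - \mathbb{E}[X]\,\mathbb{E}[Y]\,.
\end{align*}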
\end{problem}
\bigskip
\begin{problem}[5pts]
Let ${X\in\{0,1\}}$ be a binary random variable with a Bernoulli
distribution. Suppose~${p(X=1) = \theta}$, where ${\theta \in \mathbb{R}}$ and ${0<\theta<1}$. That is, the probability of $X=x$ given~$\theta$ is
\begin{align*}
p(x\,|\,\theta) &= \theta^{x}(1-\theta)^{1-x}.
\end{align*}
Prove that~${\mathbb{E}[X] = \theta}$ and ${\text{var}[X] = \theta(1-\theta)}$.
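\par\noindent
Hint: it may help to use the identity~${\text{var}[X] = \mathbb{E}[X^2] - (\mathbb{E}[X])^2}$.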
\end{problem}
\bigskip
\begin{problem}[9pts]
Let $\bm{A}, \bm{B}, \bm{C} \in \mathbb{R}^{n\times n}$ be invertible symmetric matrices that all commute with each other, and let $\bm{x}, \bm{b} \in \mathbb{R}^{n}$.\\
\noindent A. Show that if $\bm{M}, \bm{N} \in \mathbb{R}^{n\times n}$ are two symmetric matrices that commute, then $\bm{M}\bm{N}$ is also a symmetric matrix.\\
\noindent Compute the gradients of the following quantities with respect to~$\bm{x}$.\\
\noindent B. $\bm{x}^{T} \bm{A} \bm{x}$\\
\noindent C. $\text{tr}(\bm{A}\bm{x}\bm{x}^{T}\bm{B})$\\
\noindent D. $\exp\{ - (\bm{A}\bm{x} - \bm{b})^{T}\bm{C}^{-1}\bm{x}\}$
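\par\noindent
Hint: for part~C, recall that the trace is invariant under cyclic permutations, e.g.~${\text{tr}(\bm{M}\bm{N}) = \text{tr}(\bm{N}\bm{M})}$.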
\end{problem}
\bigskip
\begin{problem}[6pts]
Let $X$ be a Gaussian random variable with mean~$\mu$ and variance~$\sigma^2$.
The probability density function for~$X$ is
\begin{align*}
p_X(x) &= \frac{1}{\sqrt{2\pi\sigma^2}}\exp\{-\frac{1}{2\sigma^2}(x-\mu)^2\}\,.
\end{align*}
We now transform~$X$ into a variable~${Y=e^X}$.\\
\noindent A. What is the PDF for~$Y$, $p_Y(y)$?\\
\noindent B. What is the mean of~$Y$?
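\par\noindent
Hint: for a strictly monotone transformation~${Y = g(X)}$, the change-of-variables formula gives
\begin{align*}
p_Y(y) &= p_X\big(g^{-1}(y)\big)\left|\frac{d}{dy}\,g^{-1}(y)\right|\,.
\end{align*}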
\end{problem}
\section*{Least Squares Regression}
\begin{problem}[15pts]
Prove that the least squares objective function
\begin{align*}
L(\bm{w}) &= (\bm{X}\bm{w} - \bm{y})^{T}(\bm{X}\bm{w} - \bm{y})
\end{align*}
is convex with respect to~$\bm{w}$.
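\par\noindent
Hint: one route is via second-order conditions; a twice-differentiable function is convex if and only if its Hessian is positive semi-definite everywhere.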
\end{problem}
\bigskip
\begin{problem}[25pts]
When fitting a machine learning model, we sometimes wish to weight some data points more heavily than others when estimating parameters.
Consider a weighted data set~$\{\bm{x}_n,y_n,r_n\}^N_{n=1}$, where~$\bm{x}_n\in\mathbb{R}^D$ (with the constant $1$ column already included),~$y_n\in\mathbb{R}$, and~$r_n>0$.
Here the~$r_n$ are per-datum weights, and we'd like to perform least-squares regression that accounts for them, using the following loss:
\begin{align*}
L(\bm{w}) &= \frac{1}{\sum_{n=1}^N r_n} \sum_{n=1}^N r_n (\bm{x}_n^{T}\bm{w} - y_n)^2\,.
\end{align*}
Dividing by the sum of the weights generalizes the division by~$N$ in the unweighted case.
Derive a closed-form solution for~$\bm{w}$ that minimizes this loss.
Hint: it will be helpful to construct a diagonal matrix~$\bm{R}$ such that~${R_{n,n}=r_n}$.
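\par\noindent
Once you have a candidate closed form, a numerical sanity check can catch algebra mistakes. The sketch below (assuming NumPy and SciPy are available; the data, dimensions, and variable names are made up for illustration) minimizes the loss directly so you can compare against your answer:
\begin{verbatim}
import numpy as np
from scipy.optimize import minimize

# Random problem instance; N, D, and all names are placeholders.
rng = np.random.default_rng(0)
N, D = 50, 3
X = rng.normal(size=(N, D))
y = rng.normal(size=N)
r = rng.uniform(0.1, 2.0, size=N)

def loss(w):
    return np.sum(r * (X @ w - y) ** 2) / np.sum(r)

res = minimize(loss, np.zeros(D))
print(res.x)  # compare against your closed-form w
\end{verbatim}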
\end{problem}
\newpage
\section*{Maximum Likelihood Linear Regression}
\begin{problem}[30pts]
Here are some simple data to regress:
\begin{verbatim}
x = [-1.87, -1.76, -1.67, -1.22, -0.07, 0.11, 0.67, 1.60, 2.22, 2.51]
y = [0.06, 1.67, 0.54, -1.45, -0.18, -0.67, 0.92, 2.95, 5.13, 5.18]
\end{verbatim}
For each of the four feature representations below, use maximum likelihood estimation to fit the regression weights and the variance.
For each, 1)~report the MLE regression weights and variance, and 2)~make a plot that shows the data, the resulting predictive mean, and $2\sigma$ predictive bands around the mean.
(That is, plot a line $2\sigma$ above the mean and another~$2\sigma$ below it.)
Which of the representations seems to fit the data best?
Explain your reasoning.
Turn in the Python file that produces these plots; a minimal starting sketch for representation A appears after the list.\\
\noindent A. $\Phi(x) = [x, 1]$\\
\noindent B. $\Phi(x) = [x^2, x, 1]$\\
\noindent C. $\Phi(x) = [\sin(3x), \sin(2x), \sin(x), 1]$\\
\noindent D. $\Phi(x) = [e^{-(x-2)^2}, e^{-(x-1)^2}, e^{-x^2}, e^{-(x+1)^2}, e^{-(x+2)^2}, 1]$
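\par\noindent
To get started, here is a minimal sketch for representation A, assuming NumPy is available (variable names are just suggestions); for a Gaussian noise model, the MLE weights coincide with the least-squares solution, and the MLE variance is the mean squared residual:
\begin{verbatim}
import numpy as np

x = np.array([-1.87, -1.76, -1.67, -1.22, -0.07,
               0.11,  0.67,  1.60,  2.22,  2.51])
y = np.array([ 0.06,  1.67,  0.54, -1.45, -0.18,
              -0.67,  0.92,  2.95,  5.13,  5.18])

Phi = np.column_stack([x, np.ones_like(x)])  # Phi(x) = [x, 1]
w, *_ = np.linalg.lstsq(Phi, y, rcond=None)  # MLE weights = least squares
sigma2 = np.mean((Phi @ w - y) ** 2)         # MLE noise variance
\end{verbatim}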
\end{problem}
\subsection*{Changelog}
\begin{itemize}
\item 6 February 2019 -- Initial version.
\end{itemize}
\end{document}