% No 'submit' option for the problems by themselves.
\documentclass{cos324}
% Comment out the line above and uncomment the line below for submitting.
%\documentclass[submit]{cos324}
\usepackage{newtxtext,newtxmath}
\usepackage{bm}
% Put in your full name and email address.
\name{Your Name}
\email{email@princeton.edu}
% You don't need to change these.
\course{COS324-S19}
\assignment{Assignment \#2}
\duedate{11:55pm Wednesday, March 6, 2019}
\dropboxurl{https://dropbox.cs.princeton.edu/COS324\_S2019/HW2}
% If you want to include graphics.
\usepackage{graphicx}
\begin{document}
% IMPORTANT: Uncomment the \documentclass command at the top with the [submit] option. Otherwise your name won't appear in the doc.
\begin{problem}[1pt]
Use the instructions above to put your name on the assignment when you compile this document.
\end{problem}
\begin{problem}[14pts]
Answer the following questions about a classification problem where:\\\\ 1) there is a data set~$\{\bm{x}_n,y_n\}^N_{n=1}$ where~$\bm{x}_n\in\mathbb{R}^D$ and~$y_n\in \{0,1\}$, \\2) we will use logistic regression to model the data, \\ 3) we will use a squared~$L^2$ norm on the weights as a regularizer.\\
\noindent A. Write the log likelihood, denoting the weight of the regularization penalty as~$\lambda$ and the regression weights as~$\bm{w}$.
\noindent B. Prove that the penalized log likelihood is concave.
\end{problem}
\begin{problem}[15pts]
Imagine that we're doing basic linear regression with a probabilistic interpretation. We have data~$\{\bm{x}_n,y_n\}^N_{n=1}$ where~$\bm{x}_n\in\mathbb{R}^D$ and~$y_n\in\mathbb{R}$. We're using a Gaussian likelihood where~$y_n\,|\,\bm{x}_n,\bm{w},v \sim \mathcal{N}(y_n\,|\,\bm{w}^T\bm{x}_n,v)$. We have some \emph{a priori} knowledge about the data and so we want to make fairly strong assumptions about the weights. To reflect this knowledge we use a prior~$\bm{w} \sim \mathcal{N}(\bm{w}\,|\,\bm{\mu}, \bm{\Sigma})$. Derive the maximum \emph{a posteriori} MAP estimate of~$\bm{w}$ in terms of~$\bm{X}, \bm{y},v,\bm{\mu}$, and~$\bm{\Sigma}$.
\end{problem}
\begin{problem}[20pts]
One way we can classify data is using a \emph{generative} classifier.
A generative classifier posits a distribution over the data in which the class is an unknown \emph{latent variable}.
The model then assumes that the data were generated from a class-conditional distribution.
We learn such a classifier by fitting the parameters of each class by itself, along with the \emph{a priori} weight between the two classes, and then using Bayes' theorem.
Imagine that we have a binary classification problem with features in~$\mathbb{R}^D$ and that we've already fit two Gaussians to the two classes:
\begin{align}
\Pr(\bm{x} \,|\, \text{Class $1$}) &= \mathcal{N}(\bm{x}\,|\,\bm{\mu}_1,\bm{\Sigma}) &
\Pr(\bm{x} \,|\, \text{Class $2$}) &= \mathcal{N}(\bm{x}\,|\,\bm{\mu}_2,\bm{\Sigma})\,.
\end{align}
So the class-conditional distributions have the same covariance, but different means.
We have also already fit the marginal distributions over the two classes and found them to be equal:
\begin{align}
\Pr(\text{Class $1$}) &= \Pr(\text{Class $2$}) = \frac{1}{2}\,.
\end{align}
Classification can then be done for a new datum~$\bm{x}'$ by asking about the conditional distribution~$\Pr(\text{Class}\,|\,\bm{x}')$.
Show that the decision boundary between these two classes is a hyperplane and give an equation for that plane.
\end{problem}
\begin{problem}[10pts]
\label{prob:transform}
A. (Scalar Case) Let~$X$ be a standard normal random variable, i.e.,~$X \sim \mathcal{N}(0,1)$. Let~$Y$ be the linear transformation~$Y = aX + b$ for some fixed scalars~$a \neq 0$ and~$b$. Prove that~$Y \sim \mathcal{N}(b, a^2)$.\\\\
B. (Vector Case) Let~$X\in\mathbb{R}^D$ be a standard normal random vector, i.e.,~$X \sim \mathcal{N}(0,\bm{I}_D)$, where $\bm{I}_D$ is the identity matrix in dimension $D$. Let~$Y$ be the linear transformation~$Y = \bm{A}X + \bm{b}$, where~$\bm{A}\in \mathbb{R}^{D\times D}$ is an invertible matrix and~$\bm{b} \in \mathbb{R}^D$. Prove that~$Y \sim \mathcal{N}(\bm{b}, \bm{A}\bm{A}^{T})$.
\end{problem}
\begin{problem}[40pts]
In this problem, we'll apply logistic regression to predict whether a person will default on a home equity loan.
There are two data files: \texttt{hmeq-train.csv} and \texttt{hmeq-test.csv} with 4000 and 1357 records, respectively.
These are comma-delimited CSV files with the following columns:
\begin{itemize}
\item \texttt{BAD}: Value is 1 if the loan was bad, 0 if it was paid back. This is the quantity you will predict with your logistic regression model.
\item \texttt{LOAN}: Amount of the loan.
\item \texttt{MORTDUE}: Amount of existing mortgage.
\item \texttt{VALUE}: Value of current property.
\item \texttt{REASON}: $\{$\texttt{DebtCon}, \texttt{HomeImp}, \texttt{Unknown}$\}$
\item \texttt{JOB}: $\{$\texttt{Mgr}, \texttt{Office}, \texttt{Other}, \texttt{ProfExe}, \texttt{Self}, \texttt{Sales}, \texttt{Unknown}$\}$
\item \texttt{YOJ}: Years at present job. -1 if unknown.
\item \texttt{DEROG}: Number of major derogatory reports. -1 if unknown.
\item \texttt{DELINQ}: Number of delinquent credit lines. -1 if unknown.
\item \texttt{CLAGE}: Age of oldest credit line in months. -1 if unknown.
\item \texttt{NINQ}: Number of recent credit inquiries. -1 if unknown.
\item \texttt{CLNO}: Number of credit lines. -1 if unknown.
\end{itemize}
You will build a logistic regression model to predict the \texttt{BAD} column, using the training data.
You will need to load the CSV file (look into the \texttt{csv} python module) and turn the values into useful data that you can use as features.
In particular, that will probably mean using a one-hot coding for the categorical variables.
(Optionally, you may also want to replace the unknown -1 values with some other quantity, like the mean of that column; this is called \href{https://en.wikipedia.org/wiki/Imputation_(statistics)}{\emph{data imputation}}.)
In your first pass, don't do anything fancy with basis functions, just use the raw features and a bias term.
\noindent A. Write code to compute the training and test log likelihoods. It is likely that it will not return anything sensible. Explain why that might be.
\noindent B. Standardize the training data. That means take each of the continuous features, subtract its mean and divide by its standard deviation. This will make it zero-mean and have unit variance. Make sure to store the means and standard deviations of each column, as we'll need to transform the test data with the same values. It's bad hygiene to standardize the test data using its own mean and standard deviation.
\noindent C. Use full-batch gradient ascent to learn the weights of the logistic regression model. Try learning rates spanning several orders of magnitude (all less than one) and report what works and how many iterations are necessary to reach convergence. Plot the log likelihoods (\emph{training curves}) as a function of iteration for three different learning rates that you tried.
\noindent D. What final training log likelihood did you get with your best learning rate? What training accuracy?
\noindent E. Using the training set standardization values, transform the test set and evaluate the test log likelihood and the test accuracy. What values did you get?
\noindent F. Look at the weights that the model learned. Relate these to the underlying features and explain what you find. Any interesting associations between, e.g., job and bad debt?
\noindent G. Try out a more advanced idea to see if you can improve performance, e.g., add~$L^2$ regularization or introduce some basis functions. Explain what you tried out, why you thought it might work, and whether or not you improved performance.
\end{problem}
\subsection*{Changelog}
\begin{itemize}
\item 21 February 2019 -- Added a bullet to the last question on interpretation.
\item 21 February 2019 -- Initial version.
\end{itemize}
\end{document}