% No 'submit' option for the problems by themselves.
\documentclass{cos302}
% Use the 'submit' option when you submit your solutions.
%\documentclass[submit]{cos302}
% Put in your full name and email address.
\name{Your Name}
\email{email@princeton.edu}
\discussants{Marie Curie \\ Leonhard Euler}
% You don't need to change these.
\course{COS302-F22}
\assignment{Assignment \#6}
\duedate{6:00pm Wednesday 2 November 2022}
\dropboxurl{https://www.gradescope.com/courses/438130/assignments/2352912}
\usepackage{bm} % for bold math
\usepackage{mathtools} % colon-equals
\usepackage{multirow}
\begin{document}
% IMPORTANT: Uncomment the \documentclass command above with the [submit] option and put your name where it says ``your name''.
\begin{center}
Assignments in COS 302 should be done individually. See the \href{https://www.cs.princeton.edu/courses/archive/fall22/cos302/files/syllabus.pdf}{course syllabus} for the collaboration policy.
\vspace{\baselineskip}
\textcolor{red}{%
Remember to append your Colab PDF as explained in the first homework, with all outputs visible.\\
When you print to PDF it may be helpful to scale at 95\% or so to get everything on the page.}
\end{center}
\begin{problem}[15pts]
Consider a sample space~$\Omega=\{\text{red}, \text{green}, \text{blue}, \text{orange}, \text{yellow}\}$.
\begin{enumerate}[(A)]
\item What is the smallest possible valid event space $\mathcal{A}$?
\item What is the smallest possible event space that contains the set $\{\text{blue}\}$?
\item What is the smallest possible event space that contains both~$\{\text{blue}\}$ and~$\{\text{red},\text{green}\}$?\\ (Hint: it has eight members.)
\end{enumerate}
\end{problem}
\newpage
\begin{problem}[15pts]
Consider the following bivariate distribution $p(x, y)$ of two discrete random variables $X$ and $Y$.
\begin{center}
\begin{tabular}{c c | l | l | l | l | l |}
\cline{3-7}
\multirow{3}{*}{$Y$} & $y_1$ & 0.01 & 0.02 & 0.03 & 0.1 & 0.1 \\ \cline{3-7}
& $y_2$ & 0.05 & 0.1 & 0.05 & 0.07 & 0.2 \\\cline{3-7}
& $y_3$ & 0.1 & 0.05 & 0.03 & 0.05 & 0.04\\\cline{3-7}
& \multicolumn{1}{c}{}& \multicolumn{1}{c}{$x_1$} & \multicolumn{1}{c}{$x_2$} & \multicolumn{1}{c}{$x_3$} & \multicolumn{1}{c}{$x_4$} & \multicolumn{1}{c}{$x_5$}\\
& \multicolumn{1}{c}{}& \multicolumn{5}{ c }{$X$}
\end{tabular}
\end{center}
\begin{enumerate}[(A)]
\item What is the marginal distribution $p(x)$?
\item What is the marginal distribution $p(y)$?
\item What is the conditional distribution $p(x \,|\, Y = y_1)$?
\item What is the conditional distribution $p(y \,|\, X = x_3)$?
\item What is the conditional distribution $p(x \,|\, Y \neq y_1)$?
\end{enumerate}
\end{problem}
\newpage
\begin{problem}[20pts]
In 2014--2016, West Africa experienced a massive outbreak of Ebola.
We'll concentrate on Sierra Leone and imagine a mandatory screening of every citizen.
The probability of testing positive given that the citizen has Ebola is $84\%$.
The probability of testing positive given that the citizen does not have Ebola is $11\%$.
We also know that the probability that any given citizen has Ebola is $0.4\%$.
If a randomly-chosen citizen tests positive, what is that citizen's probability of actually having Ebola?
\end{problem}
\newpage
\begin{problem}[48pts]
In this problem you will do some mathematical calculations and also use Colab.
Be sure to append your PDF and insert your link as usual.
\begin{enumerate}[(A)]
\item Import \href{https://docs.scipy.org/doc/numpy/reference/random/index.html}{\texttt{numpy.random}} (usually aliased to \texttt{npr}) and \href{https://numpy.org/doc/stable/reference/random/generated/numpy.random.seed.html}{set the random seed}.
\item Imagine drawing 1,000 independent \href{https://en.wikipedia.org/wiki/Bernoulli_distribution}{Bernoulli variates}~$X_i\in\{0,1\}$ with the probability~${p(X_i=1) = 0.35}$, and computing their sum~
\begin{align*}
Y&=\sum_{i=1}^{1000} X_i\,.
\end{align*}
What are the mean and variance of~$Y$?
\item Generate 10,000 independent random variables~$Y$ as described above.
That is, generate 10,000 sums of 1,000 independent Bernoulli variables.
This is not as hard as it sounds.
Use the \href{https://numpy.org/doc/stable/reference/random/generated/numpy.random.rand.html}{\texttt{numpy.random.rand()}} function only; do not use any functions from \href{https://docs.scipy.org/doc/scipy/reference/stats.html}{\texttt{scipy.stats}}.
Use \texttt{rand()} to generate a $10{,}000\times 1{,}000$ matrix of independent uniform random variates in the interval~$[0,1)$, then threshold them appropriately to get~$0$ or~$1$.
Finally, sum over the appropriate dimension to get 10,000 samples of $Y$ above.
\item Use Matplotlib to \href{https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.hist.html}{make a histogram} of these samples.
Use at least 100 bins so you can see the structure in the distribution.
Do you recognize the shape?
\item Compute the empirical mean and variance of the 10,000 samples you have drawn.
Compare these results to your calculation from (B).
\item Now imagine drawing 1,000 independent (continuous) variates uniformly in the interval $[-1,1]$.
What are the mean and variance of their sum?
\item Generate 10,000 such sums using a variation of the procedure you performed for (C).
As before, you'll only use \texttt{rand()} but you should scale and shift rather than threshold.
\item Create a histogram of these sums, as in (D). Describe the shape.
\item Compute the empirical mean and variance of the 10,000 samples and compare them to your computations in (F).
\end{enumerate}
\end{problem}
\newpage
\begin{problem}[2pts]
Approximately how many hours did this assignment take you to complete?
\end{problem}
My notebook URL:
{\small \url{https://colab.research.google.com/XXXXXXXXXXXXXXXXXXXXXXX}}
\subsection*{Changelog}
\begin{itemize}
\item 12 Oct 2022 -- F22 version.
\end{itemize}
% \includepdf[pages=-]{mynotebook.pdf}
\end{document}