wip: GMLM

This commit is contained in:
Daniel Kapla 2022-12-06 15:15:00 +01:00
parent 2dfdc7083b
commit ed7b913de7
8 changed files with 212 additions and 2757 deletions

View File

@ -65,7 +65,7 @@
\newcommand*{\vech}{\operatorname{vech}}
\newcommand*{\rank}{\operatorname{rank}}
\newcommand*{\diag}{\operatorname{diag}}
\newcommand*{\perm}[1]{\textnormal{Perm}_{#1}} % set of permutations of size #1
\newcommand*{\perm}[1]{\mathfrak{S}_{#1}} % set of permutations of size #1
\newcommand*{\len}[1]{\#{#1}} % length of #1
\DeclareMathOperator*{\ttt}{\circledast}
\DeclareMathOperator{\tr}{tr}
@ -349,12 +349,28 @@ for each $j\in[r]$. The inverse link is given by
\end{displaymath}
consisting of the first and second (uncentered) vectorized moments of the tensor normal distribution.
\begin{align*}
\invlink_1(\mat{\eta}_y) &\equiv \E[\ten{X} \mid Y = y] = \ten{\mu}_y \\
\D b(\mat{\eta}_{y,1}) \equiv \invlink_1(\mat{\eta}_y) &\equiv \E[\ten{X} \mid Y = y] = \ten{\mu}_y \\
&= (\ten{\eta}_1 + \ten{F}_y\times_{k\in[r]}\mat{\alpha}_k) \times_{l\in[k]}\mat{\Omega}_k^{-1} \\
\invlink_2(\mat{\eta}_y) &\equiv \E[\vec(\ten{X})\t{\vec(\ten{X})} \mid Y = y] \\
&= \cov(\vec{X} \mid Y = y) + \vec(\ten{\mu}_y)\t{\vec(\ten{\mu}_y)} \\
\D b(\mat{\eta}_{y,2}) \equiv \invlink_2(\mat{\eta}_y) &\equiv \E[\vec(\ten{X})\t{\vec(\ten{X})} \mid Y = y] \\
&= \cov(\vec{\ten{X}} \mid Y = y) + \vec(\ten{\mu}_y)\t{\vec(\ten{\mu}_y)} \\
&= \bigotimes_{k = r}^{1}\mat{\Omega}_k^{-1} + \vec(\ten{\mu}_y)\t{\vec(\ten{\mu}_y)}
\end{align*}
Furthermore, the explicit parameterization of the second moments
\begin{displaymath}
\cov(\mat{t}(\ten{X})\mid Y = y) = \H b(\mat{\eta}_y)
= \begin{pmatrix}\mat{H}_{1,1} & \mat{H}_{1,2} \\ \mat{H}_{2,1} & \mat{H}_{2,2}\end{pmatrix}
% = \begin{pmatrix}\mat{H}_{1,1} & \mat{H}_{1,2} \\ \mat{H}_{2,1} & \mat{H}_{2,2}\end{pmatrix} = \cov(\mat{t}(\ten{X})\mid Y = y) = \begin{pmatrix}
% \cov(\mat{t}_1(\ten{X})\mid Y = y) & \cov(\mat{t}_1(\ten{X}), \mat{t}_2(\ten{X})\mid Y = y) \\
% \cov(\mat{t}_2(\ten{X}), \mat{t}_1(\ten{X})\mid Y = y) & \cov(\mat{t}_2(\ten{X})\mid Y = y)
% \end{pmatrix}
\end{displaymath}
are given by
\begin{align*}
\mat{H}_{1,1} &= \cov(\vec{\ten{X}}\mid Y = y) = \bigotimes_{k = r}^1 \mat{\Omega}_k^{-1} = \bigotimes_{k = r}^1 \mat{\Delta}_k \\
\mat{H}_{2,1} = \t{\mat{H}_{1,2}} &= \cov(\vec{\ten{X}}\otimes \vec{\ten{X}}, \vec{\ten{X}}\mid Y = y) \\
&=
\end{align*}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Ising Model}
@ -417,7 +433,7 @@ where each individual block is given by
For example $\mathcal{J}_{1,2} = -\frac{\partial l(\Theta)}{\partial\t{(\vec{\overline{\ten{\eta}}_1})}\partial(\vec{\mat{\alpha}_1})}$ and $\mathcal{J}_{2r + 1, 2r + 1} = -\H l(\mat{\Omega}_r)$.
We start by restating the log-likelihood for a given single observation $(\ten{X}, \ten{Y})$ where $\ten{F}_y$ given by
\begin{displaymath}
l(\mat{\Theta}) = \log h(\ten{X}) + c_1\langle\overline{\ten{\eta}}_1 + \ten{F}_{y}\times_{k\in[r]}\mat{\alpha}_k, \ten{X} \rangle + c_2\langle\ten{X}\times_{k\in[r]}\mat{\Omega}_k, \ten{X} \rangle - b(\mat{\eta}_{y})
l(\mat{\Theta}) = \log h(\ten{X}) + c_1\big\langle\overline{\ten{\eta}}_1 + \ten{F}_{y}\mlm{k\in[r]}\mat{\alpha}_k, \ten{X}\big\rangle + c_2\big\langle\ten{X}\mlm{k\in[r]}\mat{\Omega}_k, \ten{X}\big\rangle - b(\mat{\eta}_{y})
\end{displaymath}
with
\begin{align*}
@ -505,7 +521,7 @@ The next step is to identify the Hessians from the second differentials in a sim
&= -c_1 c_2 \t{\Big[ \t{(\ten{H}_{2,1})_{([2r])}} \vec\!\Big(\bigotimes_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big) \Big]} \vec{\d\overline{\ten{\eta}}_1} \\
&= -c_1 c_2 \t{\vec( \ten{R}_{[2r]}(\ten{H}_{2,1}) \mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}\times_j\t{(\vec{\d\mat{\Omega}_j})} )} \vec{\d\overline{\ten{\eta}}_1} \\
&= -c_1 c_2 \t{(\vec{\d\mat{\Omega}_j})} ( \ten{R}_{[2r]}(\ten{H}_{2,1}) \mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})} )_{(j)} \vec{\d\overline{\ten{\eta}}_1} \\
&\qquad\Rightarrow \frac{\partial l}{\partial(\vec{\mat{\Omega}_j})\t{\partial(\vec{\overline{\ten{\eta}}_1)}}} = -c_1 c_2 ( \ten{R}_{[2r]}(\ten{H}_{2,1}) \mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})} )_{(j)}
&\qquad\Rightarrow \frac{\partial l}{\partial(\vec{\mat{\Omega}_j})\t{\partial(\vec{\overline{\ten{\eta}}_1)}}} = -c_1 c_2 \mat{D}_{p_j}\t{\mat{D}_{p_j}}( \ten{R}_{[2r]}(\ten{H}_{2,1}) \mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})} )_{(j)}
\qquad{\color{gray} (p_j^2 \times p)}
\\
&\d^2 l(\mat{\alpha}_j) \\
@ -541,15 +557,15 @@ The next step is to identify the Hessians from the second differentials in a sim
&= -c_1 c_2 \t{(\vec{\d\mat{\alpha}_j})}\biggl(\t{\Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big)_{(j)}}\otimes\mat{I}_{p_j}\biggr) \mat{K}_{\mat{p},(j)} \vec\Bigl(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\neq l}\t{(\vec{\mat{\Omega}_k})}\times_l\t{(\vec{\d\mat{\Omega}_l})}\Bigr) \\
&= -c_1 c_2 \t{(\vec{\d\mat{\alpha}_j})}\biggl(\t{\Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big)_{(j)}}\otimes\mat{I}_{p_j}\biggr) \mat{K}_{\mat{p},(j)} \t{\Bigl(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\neq l}\t{(\vec{\mat{\Omega}_k})}\Bigr)_{([r])}}\vec{\d\mat{\Omega}_l} \\
&= -c_1 c_2 \t{(\vec{\d\mat{\alpha}_j})}\biggl( \Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j}^{[r]\backslash j + r} \Bigl(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\neq l}\t{(\vec{\mat{\Omega}_k})}\Bigr) \biggr)_{(r + 2, 1)} \vec{\d\mat{\Omega}_l} \\
&\qquad\Rightarrow \frac{\partial l}{\partial(\vec{\mat{\alpha}_j})\t{\partial(\vec{\mat{\Omega}_l})}} = -c_1 c_2 \biggl( \Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j}^{[r]\backslash j + r} \Bigl(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\neq l}\t{(\vec{\mat{\Omega}_k})}\Bigr) \biggr)_{(r + 2, 1)}
&\qquad\Rightarrow \frac{\partial l}{\partial(\vec{\mat{\alpha}_j})\t{\partial(\vec{\mat{\Omega}_l})}} = -c_1 c_2 \biggl( \Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j}^{[r]\backslash j + r} \Bigl(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\neq l}\t{(\vec{\mat{\Omega}_k})}\Bigr) \biggr)_{(r + 2, 1)}\mat{D}_{p_l}\t{\mat{D}_{p_l}}
% \qquad {\color{gray} (p_j q_j \times p_l^2)}
\\
&\d^2 l(\mat{\Omega}_j) \\
&= -c_2^2 \t{\vec\Bigl(\bigkron{k = r}{l + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_l\otimes\bigkron{k=l-1}{1}\mat{\Omega}_k\Bigr)} \t{(\ten{H}_{2,2})_{([2r],[2r]+2r)}} \vec\Bigl(\bigkron{k = r}{l + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_l\otimes\bigkron{k=l-1}{1}\mat{\Omega}_k\Bigr) \\
&= -c_2^2 \ten{R}_{[2r],[2r]+2r}(\ten{H}_{2,2})\mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}\mlm{\substack{k + r\\k\in[r]\backslash j}}\t{(\vec{\mat{\Omega}_k})}\times_j\t{(\vec{\d\mat{\Omega}_j})}\times_{j + r}\t{(\vec{\d\mat{\Omega}_j})} \\
&= -c_2^2 \t{(\vec{\d\mat{\Omega}_j})} \biggl( \ten{R}_{[2r],[2r]+2r}(\ten{H}_{2,2})\mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}\mlm{\substack{k + r\\k\in[r]\backslash j}}\t{(\vec{\mat{\Omega}_k})} \biggr)_{([r])} \vec{\d\mat{\Omega}_j} \\
&\qquad\Rightarrow \H l(\mat{\Omega}_j) = -c_2^2 \biggl( \ten{R}_{[2r],[2r]+2r}(\ten{H}_{2,2})\mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}\mlm{\substack{k + r\\k\in[r]\backslash j}}\t{(\vec{\mat{\Omega}_k})} \biggr)_{([r])}
\qquad {\color{gray} (p_j^2 \times p_j^2)}
&\qquad\Rightarrow \H l(\mat{\Omega}_j) = -c_2^2 \mat{D}_{p_j}\t{\mat{D}_{p_j}}\biggl( \ten{R}_{[2r],[2r]+2r}(\ten{H}_{2,2})\mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}\mlm{\substack{k + r\\k\in[r]\backslash j}}\t{(\vec{\mat{\Omega}_k})} \biggr)_{([r])}\mat{D}_{p_j}\t{\mat{D}_{p_j}}
%\qquad {\color{gray} (p_j^2 \times p_j^2)}
\\
&\d^2 l(\mat{\Omega}_j, \mat{\Omega}_l) \\
&\overset{\makebox[0pt]{\scriptsize $j < l$}}{=} c_2 \langle\ten{X}\times_{k\in[r]\backslash \{j, l\}}\mat{\Omega}_k\times_j\d\mat{\Omega}_j\times_l\d\mat{\Omega}_l, \ten{X}\rangle \\
@ -562,8 +578,8 @@ The next step is to identify the Hessians from the second differentials in a sim
&= c_2 \t{(\vec{\d\mat{\Omega}_j})}\Big((\ten{X}\otimes\ten{X} - \ten{R}_{[2r]}(\ten{D}_2)) \mlm{k\neq j,l}\t{(\vec{\mat{\Omega}_k})} \Big)_{(j)}\vec{\d\mat{\Omega}_l} \\
&\qquad - c_2^2 \t{(\vec{\d\mat{\Omega}_j})}\Big(\ten{R}_{([2r],[2r]+2r)}(\ten{H}_{2,2}) \mlm{k\in [r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \mlm{\substack{k + r \\ k\in [r]\backslash l}}\t{(\vec{\mat{\Omega}_k})}\Big)_{(j)}\vec{\d\mat{\Omega}_l} \\
&\qquad \begin{aligned}\Rightarrow \frac{\partial l}{\partial(\vec{\mat{\Omega}_j})\t{\partial(\vec{\mat{\Omega}_l})}} &=
c_2 \Big((\ten{X}\otimes\ten{X} - \ten{R}_{[2r]}(\ten{D}_2)) \mlm{k\neq j,l}\t{(\vec{\mat{\Omega}_k})} \Big)_{(j)} \\
&\qquad -c_2^2 \Big(\ten{R}_{([2r],[2r]+2r)}(\ten{H}_{2,2}) \mlm{k\in [r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \mlm{\substack{k + r \\ k\in [r]\backslash l}}\t{(\vec{\mat{\Omega}_k})}\Big)_{(j)}
\mat{D}_{p_j}\t{\mat{D}_{p_j}}\Big[c_2\Big((\ten{X}\otimes\ten{X} - \ten{R}_{[2r]}(\ten{D}_2)) \mlm{k\neq j,l}\t{(\vec{\mat{\Omega}_k})} \Big)_{(j)} \\
&\qquad -c_2^2 \Big(\ten{R}_{([2r],[2r]+2r)}(\ten{H}_{2,2}) \mlm{k\in [r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \mlm{\substack{k + r \\ k\in [r]\backslash l}}\t{(\vec{\mat{\Omega}_k})}\Big)_{(j)}\Big]\mat{D}_{p_l}\t{\mat{D}_{p_l}}
% \qquad {\color{gray} (p_j^2 \times p_l^2)}
\end{aligned}
\end{align*}}%
@ -1122,11 +1138,62 @@ For two matrices $\mat A$ of dimensions $a_1\times a_2$ and $\mat B$ of dimensio
\tr(\mat{A} \mat{B} \mat{C} \mat{D}) = \t{(\vec{\t{\mat{B}}})}(\t{\mat{A}}\otimes \mat{C})\vec{\mat{D}}
\end{displaymath}
Let $\ten{A}, \ten{B}$ be two tensors of order $r$, then
\begin{displaymath}
\ten{A}\otimes\ten{B} = (\ten{A}\circ\ten{B})_{((2r - 1, 2r - 3, ..., 1), (2r, 2r - 2, ..., 2))}
\end{displaymath}
where $\circ$ is the outer product. For example, considering two matrices $\mat{A}, \mat{B}$, the above simplifies to
\begin{displaymath}
\mat{A}\otimes\mat{B} = (\mat{A}\circ\mat{B})_{((3, 1), (4, 2))}.
\end{displaymath}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Matrix Calculus}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Let $f$ be an $r$ times differentiable function, then
\begin{displaymath}
\d^r f(\mat{X}) = \ten{F}(\mat{X})\mlm{k = 1}{r} \vec{\d\mat{X}}
\qquad\Leftrightarrow\qquad
\D^r f(\mat{X}) \equiv \frac{1}{r!}\sum_{\sigma\in\perm{r}}\ten{F}(\mat{X})_{(\sigma)}
\end{displaymath}
where $\perm{r}$ denotes the set of all permutations of $[r]$.
\begin{example}[Derivative with respect to a symmetric matrix]
Suppose we have a function $\mat{F}(\mat{X})$ where $\mat{X}$ is a symmetric $p\times p$ matrix. We want to find the derivative of $\mat{F}$ with respect to $\mat{X}$ under the symmetry constraint. As we know that for symmetric $\mat{X}$ holds $\vec{X} = \mat{D}_p\vech{X}$ we get in conjunction of $\mat{F}$ being only a function of $\vech{X}$ the differential as
\begin{displaymath}
\d\mat{F}(\mat{X})
= \d\mat{F}(\mat{D}_p\vech{\mat{X}})
= \D\mat{F}(\mat{X})\mat{D}_p\d\vech{\mat{X}}
= \D\mat{F}(\mat{X})\mat{D}_p\D(\vech{\mat{X}})(\mat{X})\vec{\d\mat{X}}.
\end{displaymath}
To find the value of $\D(\vech{\mat{X}})(\mat{X})$ we look at its components
\begin{displaymath}
(\D(\vech{\mat{X}})(\mat{X}))_{i, j} = \frac{(\partial \vech{X})_i}{(\partial \vec{X})_j}.
\end{displaymath}
Via an explicit indexing of the half vectorization and the vectorization operation in reference to the matrix indices of $\mat{X}$ we get with $i(k, l)\in[p(p + 1) / 2]$ and $j(m, n)\in[p^2]$ where $1 \leq l\leq k \leq p$ and $m, n\in[p]$ the mapping
\begin{align*}
i(k, l) &= (k - 1)(p + 1) + 1 - \frac{k(k - 1)}{2} + l - k \\
j(m, n) &= (n - 1)p + m
\end{align*}
that
\begin{displaymath}
\frac{(\partial \vech{X})_{i(k,l)}}{(\partial \vec{X})_{j(m,n)}} = \begin{cases}
1 & \text{iff }(k,l) = (m,n)\text{ or }(k,l) = (n,m) \\
0 & \text{else}
\end{cases}.
\end{displaymath}
This already reveals its explicit value, as the transposed duplication matrix $\mat{D}_p$ matches the same pattern, meaning
\begin{displaymath}
\D(\vech{\mat{X}})(\mat{X}) = \t{\mat{D}_p}.
\end{displaymath}
This means that from the differential $\d\mat{F}(\mat{Y}) = \mat{G}(\mat{Y})\vec{\d\mat{Y}}$ where $\mat{Y}$ is an arbitrary square matrix we can identify the derivative of $\mat{F}$ with respect to a symmetric $\mat{X}$ as
\begin{displaymath}
\d\mat{F}(\mat{Y}) = \mat{G}(\mat{Y})\vec{\d\mat{Y}}
\qquad\Rightarrow\qquad
\D\mat{F}(\mat{X}) = \mat{G}(\mat{X})\mat{D}_p\t{\mat{D}_p}\text{ for }\mat{X} = \t{\mat{X}}.
\end{displaymath}
\end{example}
\begin{example}
We want to find the derivative with respect to any of the $r$ symmetric $p_j\times p_j$ matrices $\mat{\Omega}_j$ where $j = 1, ..., r$ of the Kronecker product
@ -1296,5 +1363,47 @@ When sampling from the Multi-Array Normal one way is to sample from the Multi-Va
\end{displaymath}
where the sampling from the standard Multi-Array Normal is done by sampling all of the elements of $\ten{Z}$ from a standard Normal.
\paragraph{Moments of the Multivariate Normal (in matrix notation):}
The moment generating function of the multivariate normal distribution with parameters $\mu, \Sigma$ is
\begin{displaymath}
M(t) = \E e^{\t{t}X} = \exp(\t{t}\mu + \tfrac{1}{2}\t{t}\Sigma t).
\end{displaymath}
The differentials up to the fourth order are
\begin{align*}
\d M(t) &= M(t) \t{(\mu + \Sigma t)} \d{t} \\
\d^2 M(t) &= \t{\d{t}} M(t) (\mu + \Sigma t)\t{(\mu + \Sigma t)} \d{t} \\
\d^3 M(t) &= M(t) (\mu + \Sigma t)\circ [(\mu + \Sigma t)\circ (\mu + \Sigma t) + 3\Sigma]\mlm{k = 1}{3} \d{t} \\
\d^4 M(t) &= M(t) (\mu + \Sigma t)\circ(\mu + \Sigma t)\circ[(\mu + \Sigma t)\circ(\mu + \Sigma t) + 6\Sigma)]\mlm{k = 1}{4} \d{t}
\end{align*}
Using the differentials to derivative identification identity
\begin{displaymath}
\d^m f(t) = \ten{F}(t)\mlm{k = 1}{m}\d{t}
\qquad\Leftrightarrow\qquad
\D^m f(t) \equiv \frac{1}{m!}\sum_{\sigma\in\mathfrak{S}_m}\ten{F}(t)_{(\sigma)}
\end{displaymath}
in conjunction with simplifications gives the first four raw moments by evaluating at zero;
\begin{align*}
M_1 = \D M(t)|_{t = 0} &= \mu \\
M_2 = \D^2 M(t)|_{t = 0} &= \mu\t{\mu} + \Sigma \\
M_3 = \D^3 M(t)|_{t = 0} &= \mu\circ\mu\circ\mu + \mu\circ\Sigma + (\mu\circ\Sigma)_{((2), (1), (3))} + \Sigma\circ\mu \\
M_4 = \D^4 M(t)|_{t = 0} &\equiv \frac{1}{4!}\sum_{\sigma\in\mathfrak{S}_4} (\mu\circ\mu\circ\Sigma + \Sigma\circ\Sigma + \Sigma\circ\mu\circ\mu)_{(\sigma)}
\end{align*}
which leads to the centered moments (which are also the covariances of the sufficient statistic $t(X)$)
\begin{align*}
H_{1,1} &= \cov(t_1(X)\mid Y = y) \\
&= \Sigma \\
H_{1,2} &= \cov(t_1(X), t_2(X)\mid Y = y) \\
&= (\mu\circ\Sigma)_{(3, (1, 2))} + (\mu\circ\Sigma)_{(3, (2, 1))} \\
&= \t{\mu}\otimes\Sigma + \Sigma\otimes\t{\mu} \\
H_{2,1} &= \cov(t_2(X), t_1(X)\mid Y = y) = \t{H_{1, 2}} \\
H_{2,2} &= \cov(t_2(X)\mid Y = y) \\
&= (\mu\circ\mu\circ\Sigma + \Sigma\circ\Sigma + \Sigma\circ\mu\circ\mu)_{((1, 3), (2, 4))} +
(\mu\circ\mu\circ\Sigma + \Sigma\circ\Sigma + \Sigma\circ\mu\circ\mu)_{((1, 3), (4, 2))} \\
&\overset{???}{=} 2\Sigma\otimes\Sigma
+ \mu\otimes\Sigma\otimes\t{\mu}
+ \t{\mu}\otimes\Sigma\otimes\mu
+ \mu\t{\mu}\otimes\Sigma
+ \Sigma\otimes\mu\t{\mu}
\end{align*}
\end{document}

View File

@ -1,656 +0,0 @@
\documentclass[a4paper, 10pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{fullpage}
\usepackage{amsmath, amssymb, amstext, amsthm}
\usepackage{bm} % \boldsymbol and italic corrections, ...
\usepackage[pdftex]{hyperref}
\usepackage{makeidx} % Index (Symbols, Names, ...)
\usepackage{xcolor, graphicx} % colors and including images
\usepackage{tikz}
\usetikzlibrary{calc}
\usepackage[
% backend=bibtex,
style=authoryear-comp
]{biblatex}
\usepackage{algorithm, algpseudocode} % Pseudo Codes / Algorithms
% Document meta into
\title{Bernoulli}
\author{Daniel Kapla}
\date{\today}
% Set PDF title, author and creator.
\AtBeginDocument{
\hypersetup{
pdftitle = {Bernoulli},
pdfauthor = {Daniel Kapla},
pdfcreator = {\pdftexbanner}
}
}
\makeindex
% Bibliography resource(s)
\addbibresource{main.bib}
% Setup environments
% Theorem, Lemma
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{example}{Example}
% Definition
\theoremstyle{definition}
\newtheorem{defn}{Definition}
% Remark
\theoremstyle{remark}
\newtheorem{remark}{Remark}
% Define math macros
\newcommand{\mat}[1]{\boldsymbol{#1}}
\newcommand{\ten}[1]{\mathcal{#1}}
\renewcommand{\vec}{\operatorname{vec}}
\newcommand{\dist}{\operatorname{dist}}
\newcommand{\rank}{\operatorname{rank}}
\DeclareMathOperator{\kron}{\otimes} % Kronecker Product
\DeclareMathOperator{\hada}{\odot} % Hadamard Product
\newcommand{\ttm}[1][n]{\times_{#1}} % n-mode product (Tensor Times Matrix)
\DeclareMathOperator{\df}{df}
\DeclareMathOperator{\tr}{tr}
\DeclareMathOperator{\var}{Var}
\DeclareMathOperator{\cov}{Cov}
\DeclareMathOperator{\Span}{Span}
\DeclareMathOperator{\E}{\operatorname{\mathbb{E}}}
% \DeclareMathOperator{\independent}{{\bot\!\!\!\bot}}
\DeclareMathOperator*{\argmin}{{arg\,min}}
\DeclareMathOperator*{\argmax}{{arg\,max}}
\newcommand{\D}{\textnormal{D}} % derivative
\renewcommand{\d}{\textnormal{d}} % differential
\renewcommand{\t}[1]{{#1^{\prime}}} % matrix transpose
\newcommand{\pinv}[1]{{#1^{\dagger}}} % `Moore-Penrose pseudoinverse`
\renewcommand{\vec}{\operatorname{vec}}
\newcommand{\vech}{\operatorname{vech}}
\newcommand{\logical}[1]{{[\![#1]\!]}}
\begin{document}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Bivariate Bernoulli Distribution}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
A random 2-vector $X\in\{0, 1\}^2$ follows a \emph{Bivariate Bernoulli} distribution if its pmf is
\begin{displaymath}
P(X = (x_1, x_2)) = p_{00}^{(1-x_1)(1-x_2)}p_{01}^{(1-x_1)x_2}p_{10}^{x_1(1-x_2)}p_{11}^{x_1x_2}
\end{displaymath}
where $p_{ab} = P(X = (a, b))$ for $a, b\in\{0, 1\}$. An alternative formulation, in terms of log-odds, follows immediately as
\begin{displaymath}
P(X = (x_1, x_2)) = p_{00}\exp\Big(x_1\log\frac{p_{10}}{p_{00}} + x_2\log\frac{p_{01}}{p_{00}} + x_1x_2\log\frac{p_{00}p_{11}}{p_{01}p_{10}}\Big).
\end{displaymath}
Collecting the log-odds in a parameter vector $\mat{\theta} = \t{(\theta_{10}, \theta_{11}, \theta_{01})}$ where
\begin{align*}
\theta_{01} &= \log\frac{p_{01}}{p_{00}}, \\
\theta_{10} &= \log\frac{p_{10}}{p_{00}}, \\
\theta_{11} &= \log\frac{p_{00}p_{11}}{p_{01}p_{10}}
\end{align*}
the pmf can be written more compactly as
\begin{displaymath}
P(X = (x_1, x_2)) = P(X = \mat{x}) = p_{00}\exp(\t{\mat{\theta}}\vech(\mat{x}\t{\mat{x}}))
= p_{00}\exp(\t{\mat{x}}\mat{\Theta}\mat{x})
\end{displaymath}
with the parameter matrix $\mat{\Theta}$ defined as
\begin{displaymath}
\mat{\Theta} = \begin{pmatrix}
\theta_{10} & \tfrac{1}{2}\theta_{11} \\
\tfrac{1}{2}\theta_{11} & \theta_{01}
\end{pmatrix} = \begin{pmatrix}
\log\frac{p_{10}}{p_{00}} & \tfrac{1}{2}\log\frac{p_{00}p_{11}}{p_{01}p_{10}} \\
\tfrac{1}{2}\log\frac{p_{00}p_{11}}{p_{01}p_{10}} & \log\frac{p_{01}}{p_{00}}
\end{pmatrix}.
\end{displaymath}
The marginal distributions of $X_1$ and $X_2$ are given by
\begin{align*}
P(X_1 = x_1) &= P(X = (x_1, 0)) + P(X = (x_1, 1)) \\
&= p_{00}^{1-x_1}p_{10}^{x_1} + p_{01}^{1-x_1}p_{11}^{x_1} \\
&= \begin{cases}
p_{00} + p_{01}, & x_1 = 0 \\
p_{10} + p_{11}, & x_1 = 1
\end{cases} \\
&= (p_{00} + p_{01})^{1-x_1}(p_{10} + p_{11})^{x_1}. \\
P(X_2 = x_2) &= (p_{00} + p_{10})^{1-x_2}(p_{10} + p_{11})^{x_2}.
\end{align*}
Furthermore, the conditional distributions are
\begin{align*}
P(X_1 = x_1|X_2 = x_2) = \frac{P(X = (x_1, x_2))}{P(X_2 = x_2)}
\propto \big(p_{00}^{1-x_2}p_{01}^{x_2}\big)^{1-x_1}\big(p_{10}^{1-x_2}p_{11}^{x_2}\big)^{x_1}, \\
P(X_2 = x_2|X_1 = x_1) = \frac{P(X = (x_1, x_2))}{P(X_1 = x_1)}
\propto \big(p_{00}^{1-x_1}p_{10}^{x_1}\big)^{1-x_2}\big(p_{01}^{1-x_1}p_{11}^{x_1}\big)^{x_2}.
\end{align*}
Note that both the marginal and the conditional are again Bernoulli distributed. It is also of interest to look at the covariance between the components of $X$, which is given by
\begin{displaymath}
\cov(X_1, X_2) = \E[(X_1 - \E X_1)(X_2 - \E X_2)] = p_{00}p_{11} - p_{01}p_{10}
\end{displaymath}
which follows by direct computation.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Multivariate Bernoulli Distribution}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
This is a direct generalization of the Bivariate Bernoulli Distribution. Before we start, a few notes on notation. Let $a, b$ be binary vectors, then $\logical{a = b} = 1$ if and only if $\forall i : a_i = b_i$ and zero otherwise. With that, let $Y\in\{0, 1\}^q$ be a $q$-dimensional \emph{Multivariate Bernoulli} random variable with pmf
\begin{equation}\label{eq:mvb_pmf}
P(Y = y) = \prod_{a\in\{0, 1\}^q} p_a^{\logical{y = a}} = p_y.
\end{equation}
The parameters are $2^q$ parameters $p_a$ which are indexed by the event $a\in\{0, 1\}^q$. The ``indexing'' is done by identifying an event $a\in\{0, 1\}^q$ with the corresponding binary number $m$ the event represents. In more detail we equate an event $a\in\{0, 1\}^q$ with a number $m\in[0; 2^q - 1]$ as
\begin{equation}\label{eq:mvb_numbering}
m = m(a) = \sum_{i = 1}^q 2^{q - i}a_i
\end{equation}
which is a one-to-one relation. For example, for $q = 3$ all events are numbered as in Table~\ref{tab:event-to-number}.
\begin{table}[!ht]
\centering
\begin{minipage}{0.8\textwidth}
\centering
\begin{tabular}{c|c}
Event $a$ & Number $m(a)$ \\ \hline
(0, 0, 0) & 0 \\
(0, 0, 1) & 1 \\
(0, 1, 0) & 2 \\
(0, 1, 1) & 3 \\
(1, 0, 0) & 4 \\
(1, 0, 1) & 5 \\
(1, 1, 0) & 6 \\
(1, 1, 1) & 7
\end{tabular}
\caption{\label{tab:event-to-number}\small Event numbering relation for $q = 3$. The events $a$ are all the possible elements of $\{0, 1\}^3$ and the numbers $m$ range from $0$ to $2^3 - 1 = 7$.}
\end{minipage}
\end{table}
\subsection{Exponential Family and Natural Parameters}
The Multivariate Bernoulli is a member of the exponential family. This can be seen by rewriting the pmf \eqref{eq:mvb_pmf} in terms of an exponential family. First, we take a look at $\logical{y = a}$ for two binary vectors $y, a$ which can be written as
\begin{align*}
\logical{y = a}
&= \prod_{i = 1}^q (y_i a_i + (1 - y_i)(1 - a_i))
= \prod_{i = 1}^q (y_i (2 a_i - 1) + (1 - a_i)) \\
&= \sum_{b\in\{0, 1\}^q}\prod_{i = 1}^q [y_i (2 a_i - 1)]^{b_i}(1 - a_i)^{1-b_i} \\
\intertext{where the last equality follows by multiplying it out similar to the binomial theorem. Note that the inner product is zero if there is at least one $i$ such that $a_i = 1$ but $b_i = 0$, because then $(1 - a_i)^{1-b_i} = 0$, and it is $1$ in all other cases. Therefore, using $\logical{a \leq b}$ gives}
...
&= \sum_{b\in\{0, 1\}^q}\logical{a\leq b}\prod_{i = 1}^q [y_i (2 a_i - 1)]^{b_i}
\intertext{Next, given $\logical{a \leq b}$ we get that $\prod_{i = 1}^q (2 a_i - 1)^{b_i} = (-1)^\logical{|b| \equiv_2 |a|}$ by counting the number of zeros in $a$ where at the same index $b$ is one. This is because $(2 a_i - 1)^{b_i} = -1$ for $a_i = 0$ and $b_i = 1$, and $1$ in every other case. This is encoded in $|b| \equiv_2 |a|$, as this is true if there is an even number of $a_i = 0$ and $b_i = 1$ cases and false otherwise. This leads to the final version of the rewriting of $\logical{y = a}$ as
}
...
&= \sum_{b\in\{0, 1\}^q}\logical{a\leq b}(-1)^\logical{|b|\equiv_2|a|}\prod_{i = 1}^q y_i^{b_i}.
\end{align*}
Now, taking the log of \eqref{eq:mvb_pmf} and substituting $\logical{y = a}$ gives
\begin{align*}
\log P(Y = y)
&= \sum_{a\in\{0, 1\}^q}\logical{y = a}\log p_a \\
&= \sum_{b\in\{0, 1\}^q}\sum_{a\in\{0, 1\}^q}\log(p_a)\logical{a\leq b}(-1)^\logical{|b|\equiv_2|a|}\prod_{i = 1}^q y_i^{b_i} \\
&= \sum_{b\in\{0, 1\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\sum_{a\leq b}\log(p_a)(-1)^\logical{|b|\equiv_2|a|} \\
&= \sum_{b\in\{0, 1\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\log\frac{\prod_{a\leq b, |a|\equiv_2|b|}p_a}{\prod_{a\leq b, |a|\not\equiv_2|b|}p_a}
\end{align*}
For each $b\in\{0, 1\}^q$ except for $b = (0, ..., 0)$ define
\begin{displaymath}
\theta_{m(b)} = \theta_b = \log\frac{\prod_{a\leq b, |a|\equiv_2|b|}p_a}{\prod_{a\leq b, |a|\not\equiv_2|b|}p_a}, \qquad b\in\{0, 1\}^q\backslash\{0\}^q
\end{displaymath}
and collect the thetas in a combined vector $\mat{\theta} = (\theta_1, \theta_2, ..., \theta_{2^q - 1})$ where we used the Bernoulli event-to-number identification of \eqref{eq:mvb_numbering}. The zero event is excluded here because it is not needed: it is already determined by all the other parameters and will be incorporated as the pmf scaling factor in the exponential family representation. Using the newly defined $\mat{\theta}$ we get
\begin{align*}
P(Y = y) &= \exp\log P(Y = y)
= \exp\left(\sum_{b\in\{0, 1\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\log\frac{\prod_{a\leq b, |a|\equiv_2|b|}p_a}{\prod_{a\leq b, |a|\not\equiv_2|b|}p_a}\right) \\
&= p_0\exp\left(\sum_{b\in\{0, 1\}^q\backslash\{0\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\theta_b\right)
\end{align*}
The final step is to determine a representation of $p_0$ in terms of $\mat{\theta}$. But this follows simply from the fact that probabilities sum to $1$.
\begin{align*}
1 &= \sum_{y\in\{0, 1\}^q}P(Y = y) = p_0\left(1 + \sum_{y\in\{0, 1\}^q\backslash\{0\}^q}\exp\left(\sum_{b\in\{0, 1\}^q\backslash\{0\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\theta_b\right)\right), \\
p_0(\mat{\theta}) &= \left(1 + \sum_{y\in\{0, 1\}^q\backslash\{0\}^q}\exp\left(\sum_{b\in\{0, 1\}^q\backslash\{0\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\theta_b\right)\right)^{-1}.
\end{align*}
This gives the pmf representation as
\begin{equation}\label{eq:mvb_exp_fam}
P(Y = y) = p_0(\mat{\theta})\exp\left(\sum_{b\in\{0, 1\}^q\backslash\{0\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\theta_b\right)
= p_0(\mat{\theta})\exp(\t{T(y)}\mat{\theta})
\end{equation}
which proves the fact that the Multivariate Bernoulli is a member of the exponential family. Furthermore, the statistic $T(y)$ in \eqref{eq:mvb_exp_fam} is
\begin{displaymath}
T(y) = \left(\prod_{i = 1}^q y_i^{b_i}\right)_{b\in\{0, 1\}^q\backslash\{0\}^q}
\end{displaymath}
which is a $2^q - 1$ dimensional binary vector.
\subsection{Expectation and Covariance}
First the expectation of a Multivariate Bernoulli $Y\sim\mathcal{B}_p$ is given by
\begin{displaymath}
(\E Y)_j = (\E Y_j) = \sum_{\substack{y\in\{0, 1\}^q}} P(Y = y)y_j = \sum_{\substack{y\in\{0, 1\}^q\\y_j = 1}}P(Y = y)
\end{displaymath}
for each of the $j = 1, ..., q$ components of the $q$-dimensional random vector $Y$. Its covariance is similarly given by
\begin{displaymath}
\cov(Y_i, Y_j)
= \E(Y_i - \E Y_i)(Y_j - \E Y_j)
= \E Y_i Y_j - (\E Y_i)(\E Y_j)
= \sum_{\substack{y\in\{0, 1\}^q\\y_i = y_j = 1}}P(Y = y) - (\E Y_i)(\E Y_j).
\end{displaymath}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{The Ising Model}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The Ising model is a special case of the Multivariate Bernoulli with pmf defined directly in its exponential family form by
\begin{displaymath}
P_{\mat{\theta}}(Y = \mat{y}) = p_0(\mat{\theta})\exp\left(\sum_{i = 1}^q \theta_{\iota(i, i)}y_i + \sum_{i = 1}^{q - 1} \sum_{j = i + 1}^q \theta_{\iota(i, j)}y_i y_j\right).
\end{displaymath}
This ``constrained'' model only considers two-way interactions with the natural parameters $\mat{\theta}$ of size $q(q + 1)/2$. The indexing function $\iota$ maps the vector indices to the corresponding parameter indices
\begin{align*}
\iota(i, j) &= \iota_0(\min(i, j) - 1, \max(i, j) - 1) + 1, & i, j &= 1, ..., q \\
\intertext{with the $0$-indexed mapping}
\iota_0(i, j) &= \frac{i (2 q + 1 - i)}{2} + (j - i) & i, j &= 0, ..., q - 1\text{ and }i\leq j.
\end{align*}
This index mapping is constructed such that the half vectorization of the outer product $\mat{y}\t{\mat{y}}$ corresponds, in its single and two-way interactions between components, to the appropriate parameter indices in $\mat{\theta}$. In other words, the above pmf can be written as
\begin{displaymath}
P_{\mat{\theta}}(Y = \mat{y}) = p_0(\mat{\theta})\exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} ).
\end{displaymath}
The scaling factor $p_0(\mat{\theta})$ (which is also the probability of the zero event, therefore the name) is
\begin{displaymath}
p_0(\mat{\theta}) = \Big( \sum_{y\in\{0, 1\}^q} \exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} ) \Big)^{-1}.
\end{displaymath}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Conditional Distribution}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
{\color{red} TODO: Fix this, its wrong!!!
For the conditional distribution under the Ising model let $I\subsetneq[q]$ be a non-empty index set (non-empty because the empty set corresponds to no conditioning, and a proper subset to avoid $P_{\mat{\theta}}(\emptyset|Y = \mat{y})$). Then denote with $\mat{y}_I$ the $|I|$ sub-vector of $\mat{y}$ consisting only of the indices in $I$ while $\mat{y}_{-I}$ is the $q - |I|$ vector with all indices \emph{not} in $I$.
\begin{displaymath}
P_{\mat{\theta}}(Y_{I} = \mat{y}_{I} | Y_{-I} = \mat{y}_{-I})
= \frac{P_{\mat{\theta}}(Y = \mat{y})}{P_{\mat{\theta}}(Y_{-I} = \mat{y}_{-I})}
\propto (\mat{a}\mapsto P_{\mat{\theta}}(Y_{I} = \mat{a}, Y_{-I} = \mat{y}_{-I}))|_{\mat{a} = \mat{y}_I}
\end{displaymath}
now noting that
\begin{displaymath}
\vech(\mat{y}\t{\mat{y}}) = \vech\Big(\mat{y}\t{\big(\mat{y} - \sum_{i\in I}y_i\mat{e}_i\big)}\Big)
+ \vech\Big(\mat{y}\t{\big(\sum_{i\in I}y_i\mat{e}_i\big)}\Big)
\end{displaymath}
with the left summand depending only on $\mat{y}_{-I}$ due to the half vectorization only considering the lower triangular part and the main diagonal. By substitution into the conditional probability we get
\begin{align*}
P_{\mat{\theta}}(Y_{I} = \mat{y}_{I} | Y_{-I} = \mat{y}_{-I})
&\propto \exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} ) \\
&= \exp\Big( \t{\vech\Big(\mat{y}\t{\big(\mat{y} - \sum_{i\in I}y_i\mat{e}_i\big)}\Big)}\mat{\theta} \Big)
\exp\Big( \t{\vech\Big(\mat{y}\t{\big(\sum_{i\in I}y_i\mat{e}_i\big)}\Big)}\mat{\theta} \Big) \\
&\propto \exp\Big( \t{\vech\Big(\mat{y}\t{\big(\sum_{i\in I}y_i\mat{e}_i\big)}\Big)}\mat{\theta} \Big) \\
&= \prod_{i\in I}\exp( \t{\vech(\mat{y}\t{\mat{e}_i})}\mat{\theta} )^{y_i}
\end{align*}
leading to the scaled form
\begin{displaymath}
P_{\mat{\theta}}(Y_{I} = \mat{y}_{I} | Y_{-I} = \mat{y}_{-I})
= p_0(\mat{\theta} | Y_{-I} = \mat{y}_{-I})\prod_{i\in I}\exp( \t{\vech(\mat{y}\t{\mat{e}_i})}\mat{\theta} )^{y_i}
\end{displaymath}
with
\begin{displaymath}
p_0(\mat{\theta} | Y_{-I} = \mat{y}_{-I})
= P_{\mat{\theta}}(Y_{I} = \mat{0} | Y_{-I} = \mat{y}_{-I})
= \Big(\sum_{\substack{\mat{a}\in\{0, 1\}^q\\\mat{a}_{-I} = \mat{y}_{-I}}}
\prod_{i\in I}\exp( \t{\vech(\mat{a}\t{\mat{e}_i})}\mat{\theta} )^{a_i}\Big)^{-1}.
\end{displaymath}
} % end of TODO: Fix this, its wrong!!!
Two special cases are of interest, first the single component case with $I = \{i\}$
\begin{displaymath}
P_{\mat{\theta}}(Y_{i} = {y}_{i} | Y_{-i} = \mat{y}_{-i})
= \frac{\exp( \t{\vech(\mat{y}\t{\mat{e}_i})}\mat{\theta} )^{y_i}}{1 + \exp( \t{\vech(\mat{y}\t{\mat{e}_i})}\mat{\theta} )}
\end{displaymath}
and with $\mat{y}_{-i} = \mat{0}$ we get
\begin{displaymath}
P_{\mat{\theta}}(Y_{i} = {y}_{i} | Y_{-i} = \mat{0})
= \frac{\exp( {\theta}_{\iota(i)} )^{y_i}}{1 + \exp( {\theta}_{\iota(i)} )}
\end{displaymath}
leading to
\begin{displaymath}
\theta_{\iota(i)} = \log\frac{P_{\mat{\theta}}(Y_{i} = 1 | Y_{-i} = \mat{0})}{P_{\mat{\theta}}(Y_{i} = 0 | Y_{-i} = \mat{0})} = \log\frac{P_{\mat{\theta}}(Y_{i} = 1 | Y_{-i} = \mat{0})}{1 - P_{\mat{\theta}}(Y_{i} = 1 | Y_{-i} = \mat{0})}
\end{displaymath}
The second case considers the conditional distribution of two components given all the rest is zero, meaning that $I = \{i, j\}$ and $\mat{y}_{-i,-j} = \mat{0}$, which has the form
\begin{align*}
P_{\mat{\theta}}(Y_{i} = {y}_{i}, Y_{j} = {y}_{j} | Y_{-i,-j} = \mat{0})
&= p_0(\mat{\theta} | Y_{-i,-j} = \mat{0})\exp( \t{\vech(\mat{y}\t{\mat{e}_i})}\mat{\theta} )^{y_i}
\exp( \t{\vech(\mat{y}\t{\mat{e}_j})}\mat{\theta} )^{y_j} \\
&= p_0(\mat{\theta} | Y_{-i,-j} = \mat{0})\exp(y_i\theta_{\iota(i)} + y_j\theta_{\iota(j)} + y_iy_j\theta_{\iota(i,j)})
\end{align*}
By setting the combinations of $y_i, y_j\in\{0, 1\}$ we get that
\begin{align*}
\theta_{\iota(i, j)}
&= \log\frac{P_{\mat{\theta}}(Y_{i} = 0, Y_{j} = 0 | Y_{-i,-j} = \mat{0})P_{\mat{\theta}}(Y_{i} = 1, Y_{j} = 1 | Y_{-i,-j} = \mat{0})}{P_{\mat{\theta}}(Y_{i} = 0, Y_{j} = 1 | Y_{-i,-j} = \mat{0})P_{\mat{\theta}}(Y_{i} = 1, Y_{j} = 0 | Y_{-i,-j} = \mat{0})} \\
&= \log\frac{P_{\mat{\theta}}(Y_i = Y_j = 1 | Y_{-i,-j} = \mat{0})}{(1 - P_{\mat{\theta}}(Y_i = Y_j = 1 | Y_{-i,-j} = \mat{0}))}\frac{(1 - P_{\mat{\theta}}(Y_i = 1 | Y_{-i} = \mat{0})P_{\mat{\theta}}(Y_j = 1 | Y_{-j} = \mat{0}))}{P_{\mat{\theta}}(Y_i = 1 | Y_{-i} = \mat{0})P_{\mat{\theta}}(Y_j = 1 | Y_{-j} = \mat{0})}.
\end{align*}
Note that we have expressed all of the natural parameters $\mat{\theta}$ in terms of conditional probabilities. Either one component is $1$ and the rest is conditioned to be zero, or two components are $1$ and the rest is conditioned to be zero. This means that the set of those conditional probabilities is a sufficient statistic. Let us denote the vector of those as $\mat{\pi}$ with the same index mapping as used in $\mat{\theta}$; more precisely, let the $q(q + 1) / 2$ dimensional vector $\mat{\pi}$ be defined component-wise as
\begin{align*}
{\pi}_{\iota(i)} = {\pi}(\mat{\theta})_{\iota(i)} &= P_{\mat{\theta}}(Y_i = 1 | Y_{-i} = \mat{0}) = \frac{\exp({\theta}_{\iota(i)})}{1 + \exp({\theta}_{\iota(i)})} \\
{\pi}_{\iota(i, j)} = {\pi}(\mat{\theta})_{\iota(i, j)} &= P_{\mat{\theta}}(Y_i = Y_j = 1 | Y_{-i, -j} = \mat{0}) \\
&= \frac{\exp(\theta_{\iota(i)} + \theta_{\iota(j)} + \theta_{\iota(i, j)})}{1 + \exp(\theta_{\iota(i)}) + \exp(\theta_{\iota(j)}) + \exp(\theta_{\iota(i)} + \theta_{\iota(j)} + \theta_{\iota(i, j)})}
\end{align*}
and the component wise inverse relation is given by
\begin{equation}
\begin{aligned}[c]
\theta_{\iota(i)} = \theta(\mat{\pi})_{\iota(i)}
&= \log\frac{\pi_{\iota(i)}}{1 - \pi_{\iota(i)}} \\
\theta_{\iota(i, j)} = \theta(\mat{\pi})_{\iota(i, j)}
&= \log\frac{(1 - \pi_{\iota(i)}\pi_{\iota(j)})\pi_{\iota(i, j)}}{\pi_{\iota(i)}\pi_{\iota(j)}(1 - \pi_{\iota(i, j)})}.
\end{aligned}\label{eq:ising_theta_from_cond_prob}
\end{equation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Log-Likelihood, Score and Fisher Information}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The log-likelihood for a given dataset $\mat{y}_i$, $i = 1, ..., n$ is
\begin{displaymath}
l(\mat{\theta})
= \log\prod_{i = 1}^n P_{\mat{\theta}}(Y = \mat{y}_i)
= \log\prod_{i = 1}^n p_0(\mat{\theta})\exp( \t{\vech(\mat{y}_i\t{\mat{y}_i})}\mat{\theta} )
= n\log p_0(\mat{\theta}) + \sum_{i = 1}^n \t{\vech(\mat{y}_i\t{\mat{y}_i})}\mat{\theta}.
\end{displaymath}
For computing the Score we first get the differential
\begin{align*}
\d l(\mat{\theta})
&= n p_0(\mat{\theta})^{-1} \d p_0(\mat{\theta}) + \sum_{i = 1}^n \t{\vech(\mat{y}_i\t{\mat{y}_i})}\d\mat{\theta} \\
&= -n p_0(\mat{\theta}) \sum_{y\in\{0, 1\}^q} \exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} )\t{\vech(\mat{y}\t{\mat{y}})}\d\mat{\theta} + \sum_{i = 1}^n \t{\vech(\mat{y}_i\t{\mat{y}_i})}\d\mat{\theta}
\end{align*}
leading to the Score
\begin{align}
\nabla_{\mat{\theta}} l
&= -n p_0(\mat{\theta}) \sum_{y\in\{0, 1\}^q} \exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} )\vech(\mat{y}\t{\mat{y}}) + \sum_{i = 1}^n \vech(\mat{y}_i\t{\mat{y}_i}) \nonumber \\
&= -n \E_{\mat{\theta}} \vech(Y\t{Y}) + \sum_{i = 1}^n \vech(\mat{y}_i\t{\mat{y}_i}). \label{eq:ising_score}
\end{align}
The second differential of the log-likelihood is
\begin{multline*}
\d^2 l(\mat{\theta})
= n p_0(\mat{\theta})^2 \d\t{\mat{\theta}} \sum_{y\in\{0, 1\}^q} \exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} )\vech(\mat{y}\t{\mat{y}}) \sum_{y\in\{0, 1\}^q} \exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} )\t{\vech(\mat{y}\t{\mat{y}})}\d\mat{\theta} \\
- n p_0(\mat{\theta}) \sum_{y\in\{0, 1\}^q} \d\t{\mat{\theta}}\exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} )\vech(\mat{y}\t{\mat{y}})\t{\vech(\mat{y}\t{\mat{y}})}\d\mat{\theta} + 0
\end{multline*}
leading to the Hessian
\begin{align*}
\nabla^2_{\mat{\theta}} l
&= n (\E_{\mat{\theta}}\vech(Y\t{Y}))\t{(\E_{\mat{\theta}}\vech(Y\t{Y}))} - n \E_{\mat{\theta}}\vech(Y\t{Y})\t{\vech(Y\t{Y})} \\
&= -n \cov_{\mat{\theta}}(\vech(Y\t{Y}), \vech(Y\t{Y})).
\end{align*}
From this the Fisher Information is directly given as
\begin{displaymath}
\mathcal{I}(\mat{\theta}) = -\E_{\mat{\theta}} \nabla^2_{\mat{\theta}} l
= n \cov_{\mat{\theta}}(\vech(Y\t{Y}), \vech(Y\t{Y})).
\end{displaymath}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Estimation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
For the estimation we use the Fisher scoring algorithm which is an iterative updating algorithm following the updating rule
\begin{displaymath}
\mat{\theta}_{t+1} = \mat{\theta}_{t} + \mathcal{I}(\mat{\theta})^{-1}\nabla_{\mat{\theta}} l(\mat{\theta}).
\end{displaymath}
The basic idea behind Fisher scoring is to consider a Taylor expansion of the score
\begin{displaymath}
\nabla_{\mat{\theta}} l(\mat{\theta}^*) \approx \nabla_{\mat{\theta}}l(\mat{\theta}) + \nabla^2_{\mat{\theta}} l(\mat{\theta})(\mat{\theta}^* - \mat{\theta}).
\end{displaymath}
Setting $\mat{\theta}^*$ to the MLE estimate $\widehat{\mat{\theta}}$ we get with $\nabla_{\mat{\theta}} l(\widehat{\mat{\theta}}) = 0$ that
\begin{align*}
0 &\approx \nabla_{\mat{\theta}}l(\mat{\theta}) + \nabla^2_{\mat{\theta}} l(\mat{\theta})(\widehat{\mat{\theta}} - \mat{\theta}) \\
\widehat{\mat{\theta}} &\approx \mat{\theta} - (\nabla^2_{\mat{\theta}} l(\mat{\theta}))^{-1}\nabla_{\mat{\theta}}l(\mat{\theta}).
\end{align*}
Now, replacing the observed information $\nabla^2_{\mat{\theta}} l(\mat{\theta})$ with the Fisher information $\mathcal{I}(\mat{\theta}) = -\E_{\mat{\theta}} \nabla^2_{\mat{\theta}} l(\mat{\theta})$ leads to
\begin{displaymath}
\widehat{\mat{\theta}} \approx \mat{\theta} + \mathcal{I}(\mat{\theta})^{-1}\nabla_{\mat{\theta}}l(\mat{\theta})
\end{displaymath}
which is essentially the above updating rule.
For an initial estimate $\mat{\theta}_0$ we can evaluate the Score \eqref{eq:ising_score} at the MLE estimate $\widehat{\mat{\theta}}$ leading to
\begin{displaymath}
\E_{\widehat{\mat{\theta}}} \vech(Y\t{Y}) = \frac{1}{n}\sum_{i = 1}^n \vech(\mat{y}_i\t{\mat{y}_i}).
\end{displaymath}
With $\E_{\widehat{\mat{\theta}}} \vech(Y\t{Y})_{\iota(i, j)} = P_{\widehat{\mat{\theta}}}(Y_iY_j = 1)$ we may treat the marginal probabilities as not being far off the conditional probabilities and set $\mat{\pi}_0 = \E_{\widehat{\mat{\theta}}} \vech(Y\t{Y})$ from which we can compute $\mat{\theta}_0 = \mat{\theta}(\mat{\pi}_0)$ as in \eqref{eq:ising_theta_from_cond_prob}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{The Ising Model with Covariates}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Now assume additional covariates $X\in\mathbb{R}^p$ are given to the Multivariate Bernoulli variable $Y\in\{0, 1\}^q$ under the Ising model. Their relation is given by assuming that $Y$ follows the Ising model given $X$
\begin{align*}
P_{\mat{\theta}_X}(Y = \mat{y} | X)
&= p_0(\mat{\theta}_X)\exp(\t{\vech(\mat{y} \t{\mat{y}})}\mat{\theta}_X) \\
&= p_0(\mat{\theta}_X)\exp(\t{\mat{y}}\mat{\Theta}_X\mat{y}).
\end{align*}
with $\mat{\Theta}$ being a $q\times q$ symmetric matrix such that $\t{\mat{y}}\mat{\Theta}\mat{y} = \t{\vech(\mat{y} \t{\mat{y}})}\mat{\theta}$ for any $\mat{y}$. The explicit relation is
\begin{align*}
\mat{\Theta} &= \tfrac{1}{2}(\mat{1}_q\t{\mat{1}_q} + \mat{I}_q)\odot\vech^{-1}(\mat{\theta}), \\
\mat{\theta} &= \vech((2\mat{1}_q\t{\mat{1}_q} - \mat{I}_q)\odot\mat{\Theta}).
\end{align*}
Assuming centered covariates, $\E X = 0$ (w.l.o.g.\ because we can always replace $X$ with $X - \E X$), that the covariate dependent parameters $\mat{\Theta}_X$ relate to $X$ by
\begin{displaymath}
\mat{\Theta}_X = \t{\mat{\alpha}}X\t{X}\mat{\alpha}
\end{displaymath}
for an unconstrained parameter matrix $\mat{\alpha}$ of dimensions $p\times q$ leads to the Ising model with covariates of the form
\begin{displaymath}
P_{\mat{\alpha}}(Y = \mat{y} | X)
= p_0(\mat{\alpha}, X)\exp(\t{\mat{y}}\t{\mat{\alpha}}X\t{X}\mat{\alpha}\mat{y}).
\end{displaymath}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Log-likelihood, Score and Fisher information}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Given a single observation $(\mat{y}, \mat{x})$, the log-likelihood is
\begin{displaymath}
l(\mat{\alpha}) = \log P_{\mat{\alpha}}(Y = \mat{y} \mid X = \mat{x})
= \log p_0(\mat{\alpha}, \mat{x}) + \t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y}.
\end{displaymath}
Before we write the differential of the log-likelihood we take a look at the following
\begin{align*}
\d(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})
&= \d\tr(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y}) \\
&= 2 \tr(\mat{y}\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\d\mat{\alpha}) \\
&= 2 \t{\vec(\mat{\alpha})}(\mat{y}\t{\mat{y}}\otimes\mat{x}\t{\mat{x}})\vec(\d\mat{\alpha}) \\
\d\log p_0(\mat{\alpha}, \mat{x})
&= -p_0(\mat{\alpha}, \mat{x})\sum_{y\in\{0, 1\}^q}\d\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y}) \\
&= -2p_0(\mat{\alpha}, \mat{x})\sum_{y\in\{0, 1\}^q}\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})\tr(\mat{y}\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\d\mat{\alpha}) \\
&= -2 \tr(\E_{\mat{\alpha}}[Y\t{Y}\mid X = \mat{x}]\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\d\mat{\alpha}) \\
&= -2 \t{\vec(\mat{\alpha})}(\E_{\mat{\alpha}}[Y\t{Y}\mid X = \mat{x}]\otimes\mat{x}\t{\mat{x}})\vec(\d\mat{\alpha})
\end{align*}
Therefore, the differential of the log-likelihood is
\begin{displaymath}
\d l(\mat{\alpha})
= 2 \tr((\mat{y}\t{\mat{y}} - \E_{\mat{\alpha}}[Y\t{Y}\mid X = \mat{x}])\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\d\mat{\alpha})
\end{displaymath}
or equivalently the Score
\begin{align}
\nabla_{\mat{\alpha}}l
&= 2 \mat{x}\t{\mat{x}}\mat{\alpha}(\mat{y}\t{\mat{y}} - \E_{\mat{\alpha}}[Y\t{Y}\mid X = \mat{x}]) \nonumber \\
&= 2 \mat{x}\t{\mat{x}}\mat{\alpha}\vec^{-1}(\mat{D}_q(\vech(\mat{y}\t{\mat{y}}) - \E_{\mat{\theta}_{\mat{\alpha}}(\mat{x})}[\vech(Y\t{Y})\mid X=\mat{x}])) \label{eq:ising_cond_score} \\
&= 2 \mat{x}\t{\mat{x}}\mat{\alpha}\vec^{-1}(\mat{D}_q\nabla_{\mat{\theta}}l(\mat{\theta}_{\mat{\alpha}}(\mat{x}); \mat{y})) \nonumber
\end{align}
where $\nabla_{\mat{\theta}}l(\mat{\theta}_{\mat{\alpha}}(\mat{x}); \mat{y})$ is the Score \eqref{eq:ising_score} for a single observation and $\mat{D}_q$ is the duplication matrix.
Now we continue with the second-order differential of the log-likelihood; therefore, consider
\begin{align*}
\d^2(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})
&= 2 \tr(\mat{y}\t{\mat{y}}\t{(\d\mat{\alpha})}\mat{x}\t{\mat{x}}\d\mat{\alpha}) + 2\tr(\mat{y}\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\d^2\mat{\alpha}) \\
&= 2\t{\vec(\d\mat{\alpha})}(\mat{y}\t{\mat{y}}\otimes\mat{x}\t{\mat{x}})\vec(\d\mat{\alpha}) + 0.
\end{align*}
The next term is
\begin{align*}
\d^2 \log p_0(\mat{\alpha}, \mat{x})
&= \d\Big( -2p_0(\mat{\alpha}, \mat{x})\sum_{y\in\{0, 1\}^q}\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})\tr(\mat{y}\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\d\mat{\alpha}) \Big) \\
\intertext{To shorten the expressions let $A_{\mat{y}} = (\mat{y}\t{\mat{y}}\otimes \mat{x}\t{\mat{x}})\vec{\mat{\alpha}}$, then}
\ldots &= \d\Big( -2p_0(\mat{\alpha}, \mat{x})\sum_{y\in\{0, 1\}^q}\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})\t{A_{\mat{y}}}\vec(\d\mat{\alpha}) \Big) \\
&= -2(\d p_0(\mat{\alpha}, \mat{x}))\sum_{y\in\{0, 1\}^q}\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})\t{A_{\mat{y}}}\vec(\d\mat{\alpha}) \\
&\qquad -4 p_0(\mat{\alpha}, \mat{x})\sum_{y\in\{0, 1\}^q}\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})\t{\vec(\d\mat{\alpha})}A_{\mat{y}}\t{A_{\mat{y}}}\vec(\d\mat{\alpha}) \\
&\qquad\qquad -2 p_0(\mat{\alpha}, \mat{x})\sum_{y\in\{0, 1\}^q}\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})\t{\vec(\d\mat{\alpha})}(\mat{y}\t{\mat{y}}\otimes\mat{x}\t{\mat{x}})\vec(\d\mat{\alpha}) \\
&= 4\t{\vec(\d\mat{\alpha})} \E_{\mat{\alpha}}[A_{Y} \mid X = \mat{x}]\E_{\mat{\alpha}}[\t{A_{Y}} \mid X = \mat{x}] \vec(\d\mat{\alpha}) \\
&\qquad -4\t{\vec(\d\mat{\alpha})} \E_{\mat{\alpha}}[A_{Y}\t{A_{Y}} \mid X = \mat{x}] \vec(\d\mat{\alpha}) \\
&\qquad\qquad -2\t{\vec(\d\mat{\alpha})} \E_{\mat{\alpha}}[Y\t{Y}\otimes \mat{x}\t{\mat{x}} \mid X = \mat{x}] \vec(\d\mat{\alpha}) \\
&= -\t{\vec(\d\mat{\alpha})}( 4\cov_{\mat{\alpha}}(A_{Y} \mid X = \mat{x})
+ 2 \E_{\mat{\alpha}}[Y\t{Y}\otimes \mat{x}\t{\mat{x}} \mid X = \mat{x}]) \vec(\d\mat{\alpha})
\end{align*}
Back substituting to get the second order differential of the log-likelihood yields
\begin{displaymath}
\d^2 l(\mat{\alpha}) = \t{\vec(\d\mat{\alpha})}(
2(\mat{y}\t{\mat{y}} - \E_{\mat{\alpha}}[Y\t{Y} \mid X = \mat{x}])\otimes \mat{x}\t{\mat{x}}
- 4\cov_{\mat{\alpha}}(A_{Y} \mid X = \mat{x})
) \vec(\d\mat{\alpha})
\end{displaymath}
leading to the Hessian of the log-likelihood
\begin{displaymath}
\nabla^2_{\vec{\mat{\alpha}}} l
= 2(\mat{y}\t{\mat{y}} - \E_{\mat{\alpha}}[Y\t{Y} \mid X = \mat{x}])\otimes \mat{x}\t{\mat{x}}
- 4\cov_{\mat{\alpha}}((Y\t{Y}\otimes \mat{x}\t{\mat{x}})\vec{\mat{\alpha}} \mid X = \mat{x}).
\end{displaymath}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Estimation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
{\color{red}This needs to be figured out how to do this in a good way.}
For the initial value we start by equating the Score \eqref{eq:ising_cond_score} to zero giving us
\begin{displaymath}
\widehat{\E}_{\mat{\theta}_{\mat{\alpha}}(\mat{x})}[\vech(Y\t{Y})\mid X=\mat{x}] = \frac{1}{n}\sum_{i = 1}^n \vech(\mat{y}_i\t{\mat{y}_i})
\end{displaymath}
as an estimate of the marginal probabilities of the single and two-way interaction effects $\E_{\mat{\theta}_{\mat{\alpha}}(\mat{x})}[\vech(Y\t{Y})\mid X=\mat{x}]$. By assuming that the marginal probabilities are similar to the conditional probabilities $\mat{\pi}$ we simply equate them to get an initial estimate for the conditional probabilities
\begin{displaymath}
\widehat{\mat{\pi}}_0 = \frac{1}{n}\sum_{i = 1}^n \vech(\mat{y}_i\t{\mat{y}_i}).
\end{displaymath}
Using the relation \eqref{eq:ising_theta_from_cond_prob} we compute an initial estimate for the natural parameters
\begin{displaymath}
\widehat{\mat{\theta}}_0 = \widehat{\mat{\theta}}(\widehat{\mat{\pi}}_0)
\end{displaymath}
and convert it to the matrix version of the parameters
\begin{displaymath}
\widehat{\mat{\Theta}}_0 = \tfrac{1}{2}(\mat{1}_q\t{\mat{1}_q} + \mat{I}_q)\odot \vech^{-1}(\widehat{\mat{\theta}}_0).
\end{displaymath}
Let $\widehat{\mat{\Sigma}}_0 = \frac{1}{n}\sum_{i = 1}^n \mat{x}_i\t{\mat{x}_i}$ then we get
\begin{displaymath}
\widehat{\mat{\Theta}}_0 = \t{\widehat{\mat{\alpha}}_0}\widehat{\mat{\Sigma}}_0\widehat{\mat{\alpha}}_0.
\end{displaymath}
Next we define $m = \min(p, q)$ and take a rank $m$ approximation of both $\widehat{\mat{\Theta}}_0$ and $\widehat{\mat{\Sigma}}_0$ via an SVD. These approximations have the form
\begin{align*}
\widehat{\mat{\Theta}}_0 &\approx \mat{U}_{\widehat{\mat{\Theta}}_0} \mat{D}_{\widehat{\mat{\Theta}}_0} \t{\mat{U}_{\widehat{\mat{\Theta}}_0}} \\
\widehat{\mat{\Sigma}}_0 &\approx \mat{U}_{\widehat{\mat{\Sigma}}_0} \mat{D}_{\widehat{\mat{\Sigma}}_0} \t{\mat{U}_{\widehat{\mat{\Sigma}}_0}}
\end{align*}
where $\mat{U}_{\widehat{\mat{\Theta}}_0}$, $\mat{U}_{\widehat{\mat{\Sigma}}_0}$ are semi-orthogonal matrices of dimensions $q\times m$, $p\times m$, respectively. Both the diagonal matrices $\mat{D}_{\widehat{\mat{\Theta}}_0}$, $\mat{D}_{\widehat{\mat{\Sigma}}_0}$ have dimensions $m\times m$. Substitution of the approximations into above $\widehat{\mat{\Theta}}_0$ to $\widehat{\mat{\alpha}}_0$ relation yields
\begin{displaymath}
\mat{U}_{\widehat{\mat{\Theta}}_0} \mat{D}_{\widehat{\mat{\Theta}}_0} \t{\mat{U}_{\widehat{\mat{\Theta}}_0}} \approx \t{\widehat{\mat{\alpha}}_0}\mat{U}_{\widehat{\mat{\Sigma}}_0} \mat{D}_{\widehat{\mat{\Sigma}}_0} \t{\mat{U}_{\widehat{\mat{\Sigma}}_0}}\widehat{\mat{\alpha}}_0.
\end{displaymath}
Solving for $\widehat{\mat{\alpha}}_0$ leads to our initial value estimate
\begin{displaymath}
\widehat{\mat{\alpha}}_0 = \mat{U}_{\widehat{\mat{\Sigma}}_0} \mat{D}_{\widehat{\mat{\Sigma}}_0}^{-1/2}\mat{D}_{\widehat{\mat{\Theta}}_0}^{1/2} \t{\mat{U}_{\widehat{\mat{\Theta}}_0}}.
\end{displaymath}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
{\color{red}Some notes}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
With the conditional Fisher information $\mathcal{I}_{Y\mid X = \mat{x}}$ as
\begin{displaymath}
\mathcal{I}_{Y\mid X = \mat{x}}(\vec\mat{\alpha})
= -\E_{\mat{\alpha}}[\nabla^2_{\vec{\mat{\alpha}}} l \mid X = \mat{x}]
= 4\cov_{\mat{\alpha}}((Y\t{Y}\otimes \mat{x}\t{\mat{x}})\vec{\mat{\alpha}} \mid X = \mat{x}).
\end{displaymath}
If we know that $X\sim f_X$ with a pdf (or pmf) $f_X$ for $X$ we get
\begin{displaymath}
\mathcal{I}_{Y|X}(\vec\mat{\alpha}) = \int\mathcal{I}_{Y\mid X = \mat{x}}(\vec\mat{\alpha})f_X(\mat{x})\,\d\mat{x}
= 4\E_{X}\cov_{\mat{\alpha}}((Y\t{Y}\otimes X\t{X})\vec{\mat{\alpha}} \mid X)
\end{displaymath}
by
\begin{displaymath}
f_{\theta}(Y, X) = f_{\theta}(Y\mid X)f_{\theta}(X)
\end{displaymath}
it follows that
\begin{displaymath}
l_{X, Y}(\theta) = \log f_{\theta}(Y, X)
= \log f_{\theta}(Y\mid X) + \log f_{\theta}(X)
= l_{Y\mid X}(\theta) + l_{X}(\theta)
\end{displaymath}
% With the fisher information in general beeing defined (under any distribution) $\mathcal{I}(\theta) = \E_{\theta} \nabla l(\theta)$ with the classical Fisher information under the joint distribution of $X, Y$ to be
% \begin{align*}
% \mathcal{I}_{X, Y}(\theta)
% &= \E_{X, Y}\nabla l_{X, Y}(\theta) \\
% &= \E_{X, Y}\nabla l_{Y\mid X}(\theta) + \E_{X, Y}\nabla l_{X}(\theta) \\
% &= \E_{X}\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\mid X] + \E_{X, Y}\nabla l_{X}(\theta) \\
% &= \E_{X}\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\mid X] + \E_{X}\E_{Y\mid X}[\nabla l_{X}(\theta)\mid X] \\
% &= \E_{X}\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\mid X] + \E_{X}\nabla l_{X}(\theta) \\
% &= \mathcal{I}_{Y\mid X}(\theta) + \mathcal{I}_{X}(\theta)
% \end{align*}
% What happens if the know the conditional distribution $Y\mid X\sim f_{\theta}$ only, meaning we not know the distribution of $X$. Then the log-likelihood for a data set $(y_i, x_i)$ with $i = 1, ..., n$ observations has the form
% \begin{displaymath}
% l_{Y\mid X}(\theta) = \log\prod_{i = 1}^n f_{\theta}(Y = y_i \mid X = x_i)
% = \sum_{i = 1}^n \log f_{\theta}(Y = y_i \mid X = x_i)
% \end{displaymath}
% leading to
% \begin{displaymath}
% \nabla l_{Y\mid X}(\theta) = \sum_{i = 1}^n \nabla \log f_{\theta}(Y = y_i \mid X = x_i)
% \end{displaymath}
\appendix
\section{Notes on Fisher Information}
Let $X\sim f_{\theta}$ be a random variable following a parameterized pdf (of pmf) $f_{\theta}$ with parameter vector $\theta$. Its log-likelihood (on the population, it is itself a random variable) is then
\begin{displaymath}
l(\theta) = \log f_{\theta}(X)
\end{displaymath}
and the Score is defined as the derivative of the log-likelihood
\begin{displaymath}
\nabla l(\theta) = \nabla\log f_{\theta}(X).
\end{displaymath}
The expectation of the Score is
\begin{displaymath}
\E \nabla l(\theta)
= \int f_{\theta}(x)\nabla\log f_{\theta}(x)\,\d x
= \int \nabla f_{\theta}(x)\,\d x
= \nabla \int f_{\theta}(x)\,\d x
= \nabla 1
= 0.
\end{displaymath}
The Fisher information is defined as follows which is identical to the covariance of the Score due to the zero expectation of the Score
\begin{displaymath}
\mathcal{I}(\theta) = \E \nabla l(\theta)\t{\nabla l(\theta)}.
\end{displaymath}
Now assume we have two random variables $X, Y$ and a parameter vector $\theta$; then the joint distribution relates to the conditional and the marginal distribution by
\begin{displaymath}
f_{\theta}(X, Y) = f_{\theta}(Y\mid X)f_{\theta}(X)
\end{displaymath}
leading to the log-likelihood
\begin{displaymath}
l_{X, Y}(\theta) = \log f_{\theta}(X, Y) = \log f_{\theta}(Y\mid X) + \log f_{\theta}(X)
= l_{Y\mid X}(\theta) + l_{X}(\theta).
\end{displaymath}
The Score relates identical due to the linearity of differentiation
\begin{displaymath}
\nabla l_{X, Y}(\theta)
= \nabla l_{Y\mid X}(\theta) + \nabla l_{X}(\theta)
\end{displaymath}
and for the Fisher Information the same decomposition holds (due to a different argument) because
\begin{align*}
\mathcal{I}_{X, Y}(\theta)
&= \E_{X, Y}\nabla l_{X, Y}(\theta)\t{\nabla l_{X, Y}(\theta)} \\
&= \E_{X, Y}(\nabla l_{Y\mid X}(\theta) + \nabla l_{X}(\theta))\t{(\nabla l_{Y\mid X}(\theta) + \nabla l_{X}(\theta))} \\
&= \E_{X, Y}\nabla l_{Y\mid X}(\theta)\t{\nabla l_{Y\mid X}(\theta)}
+ \E_{X, Y}\nabla l_{Y\mid X}(\theta)\t{\nabla l_{X}(\theta)} \\
&\qquad + \E_{X, Y}\nabla l_{X}(\theta)\t{\nabla l_{Y\mid X}(\theta)}
+ \E_{X, Y}\nabla l_{X}(\theta)\t{\nabla l_{X}(\theta)} \\
&= \E_{X, Y}\nabla l_{Y\mid X}(\theta)\t{\nabla l_{Y\mid X}(\theta)}
+ \E_{X, Y}\nabla l_{X}(\theta)\t{\nabla l_{X}(\theta)}
\end{align*}
where the last equality is due to
\begin{displaymath}
\E_{X, Y}\nabla l_{Y\mid X}(\theta)\t{\nabla l_{X}(\theta)}
= \E_{X}\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\mid X]\t{\nabla l_{X}(\theta)}
= 0
\end{displaymath}
using that $\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\mid X] = 0$, i.e.\ the expectation of the Score is zero. The second cross term vanishes by the same argument and therefore we get
\begin{align*}
\mathcal{I}_{X, Y}(\theta)
&= \E_{X, Y}\nabla l_{Y\mid X}(\theta)\t{\nabla l_{Y\mid X}(\theta)}
+ \E_{X, Y}\nabla l_{X}(\theta)\t{\nabla l_{X}(\theta)} \\
&= \E_{X}\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\t{\nabla l_{Y\mid X}(\theta)} \mid X]
+ \E_{X}\nabla l_{X}(\theta)\t{\nabla l_{X}(\theta)} \\
&= \mathcal{I}_{Y\mid X}(\theta) + \mathcal{I}_{X}(\theta).
\end{align*}
Note the conditional Fisher Information which has the form
\begin{displaymath}
\mathcal{I}_{Y\mid X}(\theta)
= \E_{X}\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\t{\nabla l_{Y\mid X}(\theta)} \mid X]
= \int \mathcal{I}_{Y\mid X = x}(\theta)f_X(x)\,\d x
\end{displaymath}
Furthermore, in the case that the distribution of $X$ does not depend on $\theta$, meaning $f_{\theta}(X) = f(X)$, then $\mathcal{I}_X(\theta) = 0$ and $\mathcal{I}_{X, Y}(\theta) = \mathcal{I}_{Y \mid X}(\theta)$.
\end{document}

View File

@ -1,815 +0,0 @@
\documentclass[a4paper, 10pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{fullpage}
\usepackage{amsmath, amssymb, amstext, amsthm}
\usepackage{bm} % \boldsymbol and italic corrections, ...
\usepackage[pdftex]{hyperref}
\usepackage{makeidx} % Index (Symbols, Names, ...)
\usepackage{xcolor, graphicx} % colors and including images
\usepackage{tikz}
\usepackage[
% backend=bibtex,
style=authoryear-comp
]{biblatex}
% Document meta into
\title{Derivation of Gradient Descent Algorithm for K-PIR}
\author{Daniel Kapla}
\date{November 24, 2021}
% Set PDF title, author and creator.
\AtBeginDocument{
\hypersetup{
pdftitle = {Derivation of Gradient Descent Algorithm for K-PIR},
pdfauthor = {Daniel Kapla},
pdfcreator = {\pdftexbanner}
}
}
\makeindex
% Bibliography resource(s)
\addbibresource{main.bib}
% Setup environments
% Theorem, Lemma
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{example}{Example}
% Definition
\theoremstyle{definition}
\newtheorem{defn}{Definition}
% Remark
\theoremstyle{remark}
\newtheorem{remark}{Remark}
% Define math macros
\newcommand{\mat}[1]{\boldsymbol{#1}}
\newcommand{\ten}[1]{\mathcal{#1}}
\renewcommand{\vec}{\operatorname{vec}}
\newcommand{\dist}{\operatorname{dist}}
\DeclareMathOperator{\kron}{\otimes} % Kronecker Product
\DeclareMathOperator{\hada}{\odot} % Hadamard Product
\newcommand{\ttm}[1][n]{\times_{#1}} % n-mode product (Tensor Times Matrix)
\DeclareMathOperator{\df}{\operatorname{df}}
\DeclareMathOperator{\tr}{\operatorname{tr}}
\DeclareMathOperator{\var}{Var}
\DeclareMathOperator{\cov}{Cov}
\DeclareMathOperator{\E}{\operatorname{\mathbb{E}}}
% \DeclareMathOperator{\independent}{{\bot\!\!\!\bot}}
\DeclareMathOperator*{\argmin}{{arg\,min}}
\DeclareMathOperator*{\argmax}{{arg\,max}}
\newcommand{\D}{\textnormal{D}}
\renewcommand{\d}{\textnormal{d}}
\renewcommand{\t}[1]{{#1^{\prime}}}
\newcommand{\pinv}[1]{{#1^{\dagger}}} % `Moore-Penrose pseudoinverse`
\newcommand{\todo}[1]{{\color{red}TODO: #1}}
% \DeclareFontFamily{U}{mathx}{\hyphenchar\font45}
% \DeclareFontShape{U}{mathx}{m}{n}{
% <5> <6> <7> <8> <9> <10>
% <10.95> <12> <14.4> <17.28> <20.74> <24.88>
% mathx10
% }{}
% \DeclareSymbolFont{mathx}{U}{mathx}{m}{n}
% \DeclareMathSymbol{\bigtimes}{1}{mathx}{"91}
\begin{document}
\maketitle
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Introduction %%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Notation}
We start with a brief summary of the used notation.
\todo{write this}
Let $\ten{A}$ be a multi-dimensional array of order (rank) $r$ with dimensions $p_1\times ... \times p_r$ and the matrices $\mat{B}_i$ of dimensions $q_i\times p_i$ for $i = 1, ..., r$, then
\begin{displaymath}
\ten{A} \ttm[1] \mat{B}_1 \ttm[2] \ldots \ttm[r] \mat{B}_r
= \ten{A}\times\{ \mat{B}_1, ..., \mat{B}_r \}
= \ten{A}\times_{i\in[r]} \mat{B}_i
= (\ten{A}\times_{i\in[r]\backslash j} \mat{B}_i)\ttm[j]\mat{B}_j
\end{displaymath}
As an alternative example consider
\begin{displaymath}
\ten{A}\times_2\mat{B}_2\times_3\mat{B}_3 = \ten{A}\times\{ \mat{I}, \mat{B}_2, \mat{B}_3 \} = \ten{A}\times_{i\in\{2, 3\}}\mat{B}_i
\end{displaymath}
Another example
\begin{displaymath}
\mat{B}\mat{A}\t{\mat{C}} = \mat{A}\times_1\mat{B}\times_2\mat{C}
= \mat{A}\times\{\mat{B}, \mat{C}\}
\end{displaymath}
\begin{displaymath}
(\ten{A}\ttm[i]\mat{B})_{(i)} = \mat{B}\ten{A}_{(i)}
\end{displaymath}
\todo{continue}
\section{Tensor Normal Distribution}
Let $\ten{X}$ be a multi-dimensional array random variable of order (rank) $r$ with dimensions $p_1\times ... \times p_r$ written as
\begin{displaymath}
\ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r).
\end{displaymath}
Its density is given by
\begin{displaymath}
f(\ten{X}) = \Big( (2\pi)^{p} \prod_{i = 1}^r |\mat{\Delta}_i|^{p_{\lnot i}} \Big)^{-1/2}
\exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right)
\end{displaymath}
where $p = \prod_{i = 1}^r p_i$ and $p_{\lnot i} = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution
\begin{displaymath}
\vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1)
\end{displaymath}
with $p = \prod_{i = 1}^r p_i$.
\begin{theorem}[Tensor Normal to Multi-Variate Normal equivalence]
For a multi-dimensional random variable $\ten{X}$ of order $r$ with dimensions $p_1\times ... \times p_r$, let $\ten{\mu}$ be the mean of the same order and dimensions as $\ten{X}$ and the mode covariance matrices $\mat{\Delta}_i$ of dimensions $p_i\times p_i$ for $i = 1, ..., r$. Then the tensor normal distribution is equivalent to the multi-variate normal distribution by the relation
\begin{displaymath}
\ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r)
\qquad\Leftrightarrow\qquad
\vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes ...\otimes \mat{\Delta}_1)
\end{displaymath}
where $p = \prod_{i = 1}^r p_i$.
\end{theorem}
\begin{proof}
A straightforward way is to rewrite the Tensor Normal density as the density of a Multi-Variate Normal distribution depending on the vectorization of $\ten{X}$. First consider
\begin{align*}
\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle
&= \t{\vec(\ten{X} - \mu)}\vec((\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\}) \\
&= \t{\vec(\ten{X} - \mu)}(\mat{\Delta}_r^{-1}\otimes ...\otimes\mat{\Delta}_1^{-1})\vec(\ten{X} - \mu) \\
&= \t{(\vec\ten{X} - \vec\mu)}(\mat{\Delta}_r\otimes ...\otimes\mat{\Delta}_1)^{-1}(\vec\ten{X} - \vec\mu).
\end{align*}
Next, using a property of the determinant of a Kronecker product $|\mat{\Delta}_1\otimes\mat{\Delta}_2| = |\mat{\Delta}_1|^{p_2}|\mat{\Delta}_2|^{p_1}$ yields
\begin{displaymath}
|\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1|
= |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_2|^{p_1}|\mat{\Delta}_1|^{p_{\lnot 1}}
\end{displaymath}
where $p_{\lnot i} = \prod_{j \neq i}p_j$. By induction over $r$ the relation
\begin{displaymath}
|\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1|
= \prod_{i = 1}^r |\mat{\Delta}_i|^{p_{\lnot i}}
\end{displaymath}
holds for arbitrary order $r$. Substituting into the Tensor Normal density leads to
\begin{align*}
f(\ten{X}) = \Big( (2\pi)^p |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1| \Big)^{-1/2}
\exp\!\left( -\frac{1}{2}\t{(\vec\ten{X} - \vec\mu)}(\mat{\Delta}_r\otimes ...\otimes\mat{\Delta}_1)^{-1}(\vec\ten{X} - \vec\mu) \right)
\end{align*}
which is the Multi-Variate Normal density of the $p$ dimensional vector $\vec\ten{X}$.
\end{proof}
When sampling from the Multi-Array Normal, one way is to sample from the Multi-Variate Normal and then reshape the result, but this is usually very inefficient because it requires storing the full multi-variate covariance matrix, which is very large. Instead, it is more efficient to sample $\ten{Z}$ as a tensor of the same shape as $\ten{X}$ with standard normal entries and then transform $\ten{Z}$ to follow the Multi-Array Normal as follows
\begin{displaymath}
\ten{Z}\sim\mathcal{TN}(0, \mat{I}_{p_1}, ..., \mat{I}_{p_r})
\quad\Rightarrow\quad
\ten{X} = \ten{Z}\times\{\mat{\Delta}_1^{1/2}, ..., \mat{\Delta}_r^{1/2}\} + \mu\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r).
\end{displaymath}
where the sampling from the standard Multi-Array Normal is done by sampling all of the elements of $\ten{Z}$ from a standard Normal.
\section{Introduction}
We assume the model
\begin{displaymath}
\mat{X} = \mat{\mu} + \mat{\beta}\mat{f}_y \t{\mat{\alpha}} + \mat{\epsilon}
\end{displaymath}
where the dimensions of all the components are listed in Table~\ref{tab:dimensions}.
and its vectorized form
\begin{displaymath}
\vec\mat{X} = \vec\mat{\mu} + (\mat{\alpha}\kron\mat{\beta})\vec\mat{f}_y + \vec\mat{\epsilon}
\end{displaymath}
\begin{table}[!htp]
\centering
\begin{minipage}{0.8\textwidth}
\centering
\begin{tabular}{l l}
$\mat X, \mat\mu, \mat R, \mat\epsilon$ & $p\times q$ \\
$\mat{f}_y$ & $k\times r$ \\
$\mat\alpha$ & $q\times r$ \\
$\mat\beta$ & $p\times k$ \\
$\mat\Delta$ & $p q\times p q$ \\
$\mat\Delta_1$ & $q\times q$ \\
$\mat\Delta_2$ & $p\times p$ \\