diff --git a/LaTeX/GMLM.tex b/LaTeX/GMLM.tex index f621ad1..a7502e8 100644 --- a/LaTeX/GMLM.tex +++ b/LaTeX/GMLM.tex @@ -65,7 +65,7 @@ \newcommand*{\vech}{\operatorname{vech}} \newcommand*{\rank}{\operatorname{rank}} \newcommand*{\diag}{\operatorname{diag}} -\newcommand*{\perm}[1]{\textnormal{Perm}_{#1}} % set of permutations of size #1 +\newcommand*{\perm}[1]{\mathfrak{S}_{#1}} % set of permutations of size #1 \newcommand*{\len}[1]{\#{#1}} % length of #1 \DeclareMathOperator*{\ttt}{\circledast} \DeclareMathOperator{\tr}{tr} @@ -349,12 +349,28 @@ for each $j\in[r]$. The inverse link is given by \end{displaymath} consisting of the first and second (uncentered) vectorized moments of the tensor normal distribution. \begin{align*} - \invlink_1(\mat{\eta}_y) &\equiv \E[\ten{X} \mid Y = y] = \ten{\mu}_y \\ + \D b(\mat{\eta}_{y,1}) \equiv \invlink_1(\mat{\eta}_y) &\equiv \E[\ten{X} \mid Y = y] = \ten{\mu}_y \\ &= (\ten{\eta}_1 + \ten{F}_y\times_{k\in[r]}\mat{\alpha}_k) \times_{l\in[k]}\mat{\Omega}_k^{-1} \\ - \invlink_2(\mat{\eta}_y) &\equiv \E[\vec(\ten{X})\t{\vec(\ten{X})} \mid Y = y] \\ - &= \cov(\vec{X} \mid Y = y) + \vec(\ten{\mu}_y)\t{\vec(\ten{\mu}_y)} \\ + \D b(\mat{\eta}_{y,2}) \equiv \invlink_2(\mat{\eta}_y) &\equiv \E[\vec(\ten{X})\t{\vec(\ten{X})} \mid Y = y] \\ + &= \cov(\vec{\ten{X}} \mid Y = y) + \vec(\ten{\mu}_y)\t{\vec(\ten{\mu}_y)} \\ &= \bigotimes_{k = r}^{1}\mat{\Omega}_k^{-1} + \vec(\ten{\mu}_y)\t{\vec(\ten{\mu}_y)} \end{align*} +Furthermore, the explicit parameterization of the second moments +\begin{displaymath} + \cov(\mat{t}(\ten{X})\mid Y = y) = \H b(\mat{\eta}_y) + = \begin{pmatrix}\mat{H}_{1,1} & \mat{H}_{1,2} \\ \mat{H}_{2,1} & \mat{H}_{2,2}\end{pmatrix} + % = \begin{pmatrix}\mat{H}_{1,1} & \mat{H}_{1,2} \\ \mat{H}_{2,1} & \mat{H}_{2,2}\end{pmatrix} = \cov(\mat{t}(\ten{X})\mid Y = y) = \begin{pmatrix} + % \cov(\mat{t}_1(\ten{X})\mid Y = y) & \cov(\mat{t}_1(\ten{X}), \mat{t}_2(\ten{X})\mid Y = y) \\ + % \cov(\mat{t}_2(\ten{X}), 
\mat{t}_1(\ten{X})\mid Y = y) & \cov(\mat{t}_2(\ten{X})\mid Y = y) + % \end{pmatrix} +\end{displaymath} +are given by +\begin{align*} + \mat{H}_{1,1} &= \cov(\vec{\ten{X}}\mid Y = y) = \bigotimes_{k = r}^1 \mat{\Omega}_k^{-1} = \bigotimes_{k = r}^1 \mat{\Delta}_k \\ + \mat{H}_{2,1} = \t{\mat{H}_{1,2}} &= \cov(\vec{\ten{X}}\otimes \vec{\ten{X}}, \vec{\ten{X}}\mid Y = y) \\ + &= +\end{align*} + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Ising Model} @@ -417,7 +433,7 @@ where each individual block is given by For example $\mathcal{J}_{1,2} = -\frac{\partial l(\Theta)}{\partial\t{(\vec{\overline{\ten{\eta}}_1})}\partial(\vec{\mat{\alpha}_1})}$ and $\mathcal{J}_{2r + 1, 2r + 1} = -\H l(\mat{\Omega}_r)$. We start by restating the log-likelihood for a given single observation $(\ten{X}, \ten{Y})$ where $\ten{F}_y$ given by \begin{displaymath} - l(\mat{\Theta}) = \log h(\ten{X}) + c_1\langle\overline{\ten{\eta}}_1 + \ten{F}_{y}\times_{k\in[r]}\mat{\alpha}_k, \ten{X} \rangle + c_2\langle\ten{X}\times_{k\in[r]}\mat{\Omega}_k, \ten{X} \rangle - b(\mat{\eta}_{y}) + l(\mat{\Theta}) = \log h(\ten{X}) + c_1\big\langle\overline{\ten{\eta}}_1 + \ten{F}_{y}\mlm{k\in[r]}\mat{\alpha}_k, \ten{X}\big\rangle + c_2\big\langle\ten{X}\mlm{k\in[r]}\mat{\Omega}_k, \ten{X}\big\rangle - b(\mat{\eta}_{y}) \end{displaymath} with \begin{align*} @@ -505,7 +521,7 @@ The next step is to identify the Hessians from the second differentials in a sim &= -c_1 c_2 \t{\Big[ \t{(\ten{H}_{2,1})_{([2r])}} \vec\!\Big(\bigotimes_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big) \Big]} \vec{\d\overline{\ten{\eta}}_1} \\ &= -c_1 c_2 \t{\vec( \ten{R}_{[2r]}(\ten{H}_{2,1}) \mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}\times_j\t{(\vec{\d\mat{\Omega}_j})} )} \vec{\d\overline{\ten{\eta}}_1} \\ &= -c_1 c_2 \t{(\vec{\d\mat{\Omega}_j})} ( \ten{R}_{[2r]}(\ten{H}_{2,1}) \mlm{k\in[r]\backslash 
j}\t{(\vec{\mat{\Omega}_k})} )_{(j)} \vec{\d\overline{\ten{\eta}}_1} \\ - &\qquad\Rightarrow \frac{\partial l}{\partial(\vec{\mat{\Omega}_j})\t{\partial(\vec{\overline{\ten{\eta}}_1)}}} = -c_1 c_2 ( \ten{R}_{[2r]}(\ten{H}_{2,1}) \mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})} )_{(j)} + &\qquad\Rightarrow \frac{\partial l}{\partial(\vec{\mat{\Omega}_j})\t{\partial(\vec{\overline{\ten{\eta}}_1)}}} = -c_1 c_2 \mat{D}_{p_j}\t{\mat{D}_{p_j}}( \ten{R}_{[2r]}(\ten{H}_{2,1}) \mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})} )_{(j)} \qquad{\color{gray} (p_j^2 \times p)} \\ &\d^2 l(\mat{\alpha}_j) \\ @@ -541,15 +557,15 @@ The next step is to identify the Hessians from the second differentials in a sim &= -c_1 c_2 \t{(\vec{\d\mat{\alpha}_j})}\biggl(\t{\Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big)_{(j)}}\otimes\mat{I}_{p_j}\biggr) \mat{K}_{\mat{p},(j)} \vec\Bigl(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\neq l}\t{(\vec{\mat{\Omega}_k})}\times_l\t{(\vec{\d\mat{\Omega}_l})}\Bigr) \\ &= -c_1 c_2 \t{(\vec{\d\mat{\alpha}_j})}\biggl(\t{\Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big)_{(j)}}\otimes\mat{I}_{p_j}\biggr) \mat{K}_{\mat{p},(j)} \t{\Bigl(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\neq l}\t{(\vec{\mat{\Omega}_k})}\Bigr)_{([r])}}\vec{\d\mat{\Omega}_l} \\ &= -c_1 c_2 \t{(\vec{\d\mat{\alpha}_j})}\biggl( \Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j}^{[r]\backslash j + r} \Bigl(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\neq l}\t{(\vec{\mat{\Omega}_k})}\Bigr) \biggr)_{(r + 2, 1)} \vec{\d\mat{\Omega}_l} \\ - &\qquad\Rightarrow \frac{\partial l}{\partial(\vec{\mat{\alpha}_j})\t{\partial(\vec{\mat{\Omega}_l})}} = -c_1 c_2 \biggl( \Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j}^{[r]\backslash j + r} \Bigl(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\neq l}\t{(\vec{\mat{\Omega}_k})}\Bigr) \biggr)_{(r + 2, 1)} + &\qquad\Rightarrow \frac{\partial l}{\partial(\vec{\mat{\alpha}_j})\t{\partial(\vec{\mat{\Omega}_l})}} = -c_1 c_2 \biggl( \Big(\ten{F}_y\mlm{k\neq 
j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j}^{[r]\backslash j + r} \Bigl(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\neq l}\t{(\vec{\mat{\Omega}_k})}\Bigr) \biggr)_{(r + 2, 1)}\mat{D}_{p_l}\t{\mat{D}_{p_l}} % \qquad {\color{gray} (p_j q_j \times p_l^2)} \\ &\d^2 l(\mat{\Omega}_j) \\ &= -c_2^2 \t{\vec\Bigl(\bigkron{k = r}{l + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_l\otimes\bigkron{k=l-1}{1}\mat{\Omega}_k\Bigr)} \t{(\ten{H}_{2,2})_{([2r],[2r]+2r)}} \vec\Bigl(\bigkron{k = r}{l + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_l\otimes\bigkron{k=l-1}{1}\mat{\Omega}_k\Bigr) \\ &= -c_2^2 \ten{R}_{[2r],[2r]+2r}(\ten{H}_{2,2})\mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}\mlm{\substack{k + r\\k\in[r]\backslash j}}\t{(\vec{\mat{\Omega}_k})}\times_j\t{(\vec{\d\mat{\Omega}_j})}\times_{j + r}\t{(\vec{\d\mat{\Omega}_j})} \\ &= -c_2^2 \t{(\vec{\d\mat{\Omega}_j})} \biggl( \ten{R}_{[2r],[2r]+2r}(\ten{H}_{2,2})\mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}\mlm{\substack{k + r\\k\in[r]\backslash j}}\t{(\vec{\mat{\Omega}_k})} \biggr)_{([r])} \vec{\d\mat{\Omega}_j} \\ - &\qquad\Rightarrow \H l(\mat{\Omega}_j) = -c_2^2 \biggl( \ten{R}_{[2r],[2r]+2r}(\ten{H}_{2,2})\mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}\mlm{\substack{k + r\\k\in[r]\backslash j}}\t{(\vec{\mat{\Omega}_k})} \biggr)_{([r])} - \qquad {\color{gray} (p_j^2 \times p_j^2)} + &\qquad\Rightarrow \H l(\mat{\Omega}_j) = -c_2^2 \mat{D}_{p_j}\t{\mat{D}_{p_j}}\biggl( \ten{R}_{[2r],[2r]+2r}(\ten{H}_{2,2})\mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}\mlm{\substack{k + r\\k\in[r]\backslash j}}\t{(\vec{\mat{\Omega}_k})} \biggr)_{([r])}\mat{D}_{p_j}\t{\mat{D}_{p_j}} + %\qquad {\color{gray} (p_j^2 \times p_j^2)} \\ &\d^2 l(\mat{\Omega}_j, \mat{\Omega}_l) \\ &\overset{\makebox[0pt]{\scriptsize $j < l$}}{=} c_2 \langle\ten{X}\times_{k\in[r]\backslash \{j, l\}}\mat{\Omega}_k\times_j\d\mat{\Omega}_j\times_l\d\mat{\Omega}_l, \ten{X}\rangle \\ @@ -562,8 +578,8 @@ The next step is to identify the Hessians from the second differentials in a 
sim &= c_2 \t{(\vec{\d\mat{\Omega}_j})}\Big((\ten{X}\otimes\ten{X} - \ten{R}_{[2r]}(\ten{D}_2)) \mlm{k\neq j,l}\t{(\vec{\mat{\Omega}_k})} \Big)_{(j)}\vec{\d\mat{\Omega}_l} \\ &\qquad - c_2^2 \t{(\vec{\d\mat{\Omega}_j})}\Big(\ten{R}_{([2r],[2r]+2r)}(\ten{H}_{2,2}) \mlm{k\in [r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \mlm{\substack{k + r \\ k\in [r]\backslash l}}\t{(\vec{\mat{\Omega}_k})}\Big)_{(j)}\vec{\d\mat{\Omega}_l} \\ &\qquad \begin{aligned}\Rightarrow \frac{\partial l}{\partial(\vec{\mat{\Omega}_j})\t{\partial(\vec{\mat{\Omega}_l})}} &= - c_2 \Big((\ten{X}\otimes\ten{X} - \ten{R}_{[2r]}(\ten{D}_2)) \mlm{k\neq j,l}\t{(\vec{\mat{\Omega}_k})} \Big)_{(j)} \\ - &\qquad -c_2^2 \Big(\ten{R}_{([2r],[2r]+2r)}(\ten{H}_{2,2}) \mlm{k\in [r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \mlm{\substack{k + r \\ k\in [r]\backslash l}}\t{(\vec{\mat{\Omega}_k})}\Big)_{(j)} + \mat{D}_{p_j}\t{\mat{D}_{p_j}}\Big[c_2\Big((\ten{X}\otimes\ten{X} - \ten{R}_{[2r]}(\ten{D}_2)) \mlm{k\neq j,l}\t{(\vec{\mat{\Omega}_k})} \Big)_{(j)} \\ + &\qquad -c_2^2 \Big(\ten{R}_{([2r],[2r]+2r)}(\ten{H}_{2,2}) \mlm{k\in [r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \mlm{\substack{k + r \\ k\in [r]\backslash l}}\t{(\vec{\mat{\Omega}_k})}\Big)_{(j)}\Big]\mat{D}_{p_l}\t{\mat{D}_{p_l}} % \qquad {\color{gray} (p_j^2 \times p_l^2)} \end{aligned} \end{align*}}% @@ -1122,11 +1138,62 @@ For two matrices $\mat A$ of dimensions $a_1\times a_2$ and $\mat B$ of dimensio \tr(\mat{A} \mat{B} \mat{C} \mat{D}) = \t{(\vec{\t{\mat{B}}})}(\t{\mat{A}}\otimes \mat{C})\vec{\mat{D}} \end{displaymath} +Let $\ten{A}, \ten{B}$ be two tensors of order $r$, then +\begin{displaymath} + \ten{A}\otimes\ten{B} = (\ten{A}\circ\ten{B})_{((2r - 1, 2r - 3, ..., 1), (2r, 2r - 2, ..., 2))} +\end{displaymath} +where $\circ$ is the outer product. For example considure two matrices $\mat{A}, \mat{B}$, the above simplifies to +\begin{displaymath} + \mat{A}\otimes\mat{B} = (\mat{A}\circ\mat{B})_{((3, 1), (4, 2))}. 
+\end{displaymath} + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Matrix Calculus} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +Let $f$ be an $r$ times differentiable function, then +\begin{displaymath} + \d^r f(\mat{X}) = \ten{F}(\mat{X})\mlm{k = 1}{r} \vec{\d\mat{X}} + \qquad\Leftrightarrow\qquad + \D^r f(\mat{X}) \equiv \frac{1}{r!}\sum_{\sigma\in\perm{r}}\ten{F}(\mat{X})_{(\sigma)} +\end{displaymath} +where $\perm{r}$ denotes the set of all permutations of $[r]$. +\begin{example}[Derivative with respect to a symmetric matrix] + Suppose we have a function $\mat{F}(\mat{X})$ where $\mat{X}$ is a symmetric $p\times p$ matrix. We want to find the derivative of $\mat{F}$ with respect to $\mat{X}$ under the symmetry constraint. Since for symmetric $\mat{X}$ it holds that $\vec{X} = \mat{D}_p\vech{X}$, and $\mat{F}$ is a function of $\vech{X}$ only, we get the differential as + \begin{displaymath} + \d\mat{F}(\mat{X}) + = \d\mat{F}(\mat{D}_p\vech{\mat{X}}) + = \D\mat{F}(\mat{X})\mat{D}_p\d\vech{\mat{X}} + = \D\mat{F}(\mat{X})\mat{D}_p\D(\vech{\mat{X}})(\mat{X})\vec{\d\mat{X}}. + \end{displaymath} + To find the value of $\D(\vech{\mat{X}})(\mat{X})$ we look at its components + \begin{displaymath} + (\D(\vech{\mat{X}})(\mat{X}))_{i, j} = \frac{(\partial \vech{X})_i}{(\partial \vec{X})_j}.
+ \end{displaymath} + Via an explicit indexing of the half vectorization and the vectorization operation in reference to the matrix indices of $\mat{X}$ we get with $i(k, l)\in[p(p + 1) / 2]$ and $j(m, n)\in[p^2]$ where $1 \leq k\leq l \leq p$ and $m, n\in[p]$ the mapping + \begin{align*} + i(k, l) &= (k - 1)(p + 1) + 1 - \frac{k(k - 1)}{2} + l - k \\ + j(m, n) &= (n - 1)p + m + \end{align*} + such that + \begin{displaymath} + \frac{(\partial \vech{X})_{i(k,l)}}{(\partial \vec{X})_{j(m,n)}} = \begin{cases} + 1 & \text{iff }(k,l) = (m,n)\text{ or }(k,l) = (n,m) \\ + 0 & \text{else} + \end{cases}. + \end{displaymath} + This already reveals its explicit value, as the transposed duplication matrix $\mat{D}_p$ matches the same pattern, meaning + \begin{displaymath} + \D(\vech{\mat{X}})(\mat{X}) = \t{\mat{D}_p}. + \end{displaymath} + This means that from the differential $\d\mat{F}(\mat{Y}) = \mat{G}(\mat{Y})\vec{\d\mat{Y}}$ where $\mat{Y}$ is an arbitrary square matrix we can identify the derivative of $\mat{F}$ with respect to a symmetric $\mat{X}$ as + \begin{displaymath} + \d\mat{F}(\mat{Y}) = \mat{G}(\mat{Y})\vec{\d\mat{Y}} + \qquad\Rightarrow\qquad + \D\mat{F}(\mat{X}) = \mat{G}(\mat{X})\mat{D}_p\t{\mat{D}_p}\text{ for }\mat{X} = \t{\mat{X}}. + \end{displaymath} +\end{example} \begin{example} We want to find the derivative with respect to any of the $r$ symmetric $p_j\times p_j$ matrices $\mat{\Omega}_j$ where $j = 1, ..., r$ of the Kronecker product @@ -1296,5 +1363,47 @@ When sampling from the Multi-Array Normal one way is to sample from the Multi-Va \end{displaymath} where the sampling from the standard Multi-Array Normal is done by sampling all of the elements of $\ten{Z}$ from a standard Normal. +\paragraph{Moments of the Multivariate Normal (in matrix notation):} +The moment generating function of the multivariate normal distribution with parameters $\mu, \Sigma$ is +\begin{displaymath} + M(t) = \E e^{\t{t}X} = \exp(\t{t}\mu + \tfrac{1}{2}\t{t}\Sigma t).
+\end{displaymath} +The differentials up to the 4th are +\begin{align*} + \d M(t) &= M(t) \t{(\mu + \Sigma t)} \d{t} \\ + \d^2 M(t) &= \t{\d{t}} M(t) \big[(\mu + \Sigma t)\t{(\mu + \Sigma t)} + \Sigma\big] \d{t} \\ + \d^3 M(t) &= M(t) (\mu + \Sigma t)\circ [(\mu + \Sigma t)\circ (\mu + \Sigma t) + 3\Sigma]\mlm{k = 1}{3} \d{t} \\ + \d^4 M(t) &= M(t) (\mu + \Sigma t)\circ(\mu + \Sigma t)\circ[(\mu + \Sigma t)\circ(\mu + \Sigma t) + 6\Sigma]\mlm{k = 1}{4} \d{t} +\end{align*} +Using the differential-to-derivative identification identity +\begin{displaymath} + \d^m f(t) = \ten{F}(t)\mlm{k = 1}{m}\d{t} + \qquad\Leftrightarrow\qquad + \D^m f(t) \equiv \frac{1}{m!}\sum_{\sigma\in\mathfrak{S}_m}\ten{F}(t)_{(\sigma)} +\end{displaymath} +in conjunction with simplifications gives the first four raw moments by evaluating at zero: +\begin{align*} + M_1 = \D M(t)|_{t = 0} &= \mu \\ + M_2 = \D^2 M(t)|_{t = 0} &= \mu\t{\mu} + \Sigma \\ + M_3 = \D^3 M(t)|_{t = 0} &= \mu\circ\mu\circ\mu + \mu\circ\Sigma + (\mu\circ\Sigma)_{((2), (1), (3))} + \Sigma\circ\mu \\ + M_4 = \D^4 M(t)|_{t = 0} &\equiv \frac{1}{4!}\sum_{\sigma\in\mathfrak{S}_4} (\mu\circ\mu\circ\Sigma + \Sigma\circ\Sigma + \Sigma\circ\mu\circ\mu)_{(\sigma)} +\end{align*} +which leads to the centered moments (which are also the covariances of the sufficient statistic $t(X)$) +\begin{align*} + H_{1,1} &= \cov(t_1(X)\mid Y = y) \\ + &= \Sigma \\ + H_{1,2} &= \cov(t_1(X), t_2(X)\mid Y = y) \\ + &= (\mu\circ\Sigma)_{(3, (1, 2))} + (\mu\circ\Sigma)_{(3, (2, 1))} \\ + &= \t{\mu}\otimes\Sigma + \Sigma\otimes\t{\mu} \\ + H_{2,1} &= \cov(t_2(X), t_1(X)\mid Y = y) = \t{H_{1, 2}} \\ + H_{2,2} &= \cov(t_2(X)\mid Y = y) \\ + &= (\mu\circ\mu\circ\Sigma + \Sigma\circ\Sigma + \Sigma\circ\mu\circ\mu)_{((1, 3), (2, 4))} + + (\mu\circ\mu\circ\Sigma + \Sigma\circ\Sigma + \Sigma\circ\mu\circ\mu)_{((1, 3), (4, 2))} \\ + &\overset{???}{=} 2\Sigma\otimes\Sigma + + \mu\otimes\Sigma\otimes\t{\mu} + + \t{\mu}\otimes\Sigma\otimes\mu + + \mu\t{\mu}\otimes\Sigma + + 
\Sigma\otimes\mu\t{\mu} +\end{align*} \end{document} diff --git a/LaTeX/bernoulli.tex b/LaTeX/bernoulli.tex deleted file mode 100644 index 6be5e56..0000000 --- a/LaTeX/bernoulli.tex +++ /dev/null @@ -1,656 +0,0 @@ -\documentclass[a4paper, 10pt]{article} - -\usepackage[utf8]{inputenc} -\usepackage[T1]{fontenc} -\usepackage{fullpage} -\usepackage{amsmath, amssymb, amstext, amsthm} -\usepackage{bm} % \boldsymbol and italic corrections, ... -\usepackage[pdftex]{hyperref} -\usepackage{makeidx} % Index (Symbols, Names, ...) -\usepackage{xcolor, graphicx} % colors and including images -\usepackage{tikz} -\usetikzlibrary{calc} -\usepackage[ - % backend=bibtex, - style=authoryear-comp -]{biblatex} -\usepackage{algorithm, algpseudocode} % Pseudo Codes / Algorithms - -% Document meta into -\title{Bernoulli} -\author{Daniel Kapla} -\date{\today} -% Set PDF title, author and creator. -\AtBeginDocument{ - \hypersetup{ - pdftitle = {Bernoulli}, - pdfauthor = {Daniel Kapla}, - pdfcreator = {\pdftexbanner} - } -} - -\makeindex - -% Bibliography resource(s) -\addbibresource{main.bib} - -% Setup environments -% Theorem, Lemma -\theoremstyle{plain} -\newtheorem{theorem}{Theorem} -\newtheorem{lemma}{Lemma} -\newtheorem{example}{Example} -% Definition -\theoremstyle{definition} -\newtheorem{defn}{Definition} -% Remark -\theoremstyle{remark} -\newtheorem{remark}{Remark} - -% Define math macros -\newcommand{\mat}[1]{\boldsymbol{#1}} -\newcommand{\ten}[1]{\mathcal{#1}} -\renewcommand{\vec}{\operatorname{vec}} -\newcommand{\dist}{\operatorname{dist}} -\newcommand{\rank}{\operatorname{rank}} -\DeclareMathOperator{\kron}{\otimes} % Kronecker Product -\DeclareMathOperator{\hada}{\odot} % Hadamard Product -\newcommand{\ttm}[1][n]{\times_{#1}} % n-mode product (Tensor Times Matrix) -\DeclareMathOperator{\df}{df} -\DeclareMathOperator{\tr}{tr} -\DeclareMathOperator{\var}{Var} -\DeclareMathOperator{\cov}{Cov} -\DeclareMathOperator{\Span}{Span} -\DeclareMathOperator{\E}{\operatorname{\mathbb{E}}} 
-% \DeclareMathOperator{\independent}{{\bot\!\!\!\bot}} -\DeclareMathOperator*{\argmin}{{arg\,min}} -\DeclareMathOperator*{\argmax}{{arg\,max}} -\newcommand{\D}{\textnormal{D}} % derivative -\renewcommand{\d}{\textnormal{d}} % differential -\renewcommand{\t}[1]{{#1^{\prime}}} % matrix transpose -\newcommand{\pinv}[1]{{#1^{\dagger}}} % `Moore-Penrose pseudoinverse` -\renewcommand{\vec}{\operatorname{vec}} -\newcommand{\vech}{\operatorname{vech}} -\newcommand{\logical}[1]{{[\![#1]\!]}} - -\begin{document} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{Bivariate Bernoulli Distribution} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -A random 2-vector $X\in\{0, 1\}^2$ follows a \emph{Bivariate Bernoulli} distribution if its pmf is -\begin{displaymath} - P(X = (x_1, x_2)) = p_{00}^{(1-x_1)(1-x_2)}p_{01}^{(1-x_1)x_2}p_{10}^{x_1(1-x_2)}p_{11}^{x_1x_2} -\end{displaymath} -where $p_{ab} = P(X = (a, b))$ for $a, b\in\{0, 1\}$. An alternative formulation, in terms of log-odds, follows immediately as -\begin{displaymath} - P(X = (x_1, x_2)) = p_{00}\exp\Big(x_1\log\frac{p_{10}}{p_{00}} + x_2\log\frac{p_{01}}{p_{00}} + x_1x_2\log\frac{p_{00}p_{11}}{p_{01}p_{10}}\Big). 
-\end{displaymath} -Collecting the log-odds in a parameter vector $\mat{\theta} = \t{(\theta_{01}, \theta_{10}, \theta_{11})}$ where -\begin{align*} - \theta_{01} &= \log\frac{p_{01}}{p_{00}}, \\ - \theta_{10} &= \log\frac{p_{10}}{p_{00}}, \\ - \theta_{11} &= \log\frac{p_{00}p_{11}}{p_{01}p_{10}} -\end{align*} -the pmf can be written more compact as -\begin{displaymath} - P(X = (x_1, x_2)) = P(X = \mat{x}) = p_{00}\exp(\t{\mat{\theta}}\vech(\mat{x}\t{\mat{x}})) - = p_{00}\exp(\t{\mat{x}}\mat{\Theta}\mat{x}) -\end{displaymath} -with the parameter matrix $\mat{\Theta}$ defined as -\begin{displaymath} - \mat{\Theta} = \begin{pmatrix} - \theta_{01} & \tfrac{1}{2}\theta_{11} \\ - \tfrac{1}{2}\theta_{11} & \theta_{10} - \end{pmatrix} = \begin{pmatrix} - \log\frac{p_{01}}{p_{00}} & \tfrac{1}{2}\log\frac{p_{00}p_{11}}{p_{01}p_{10}} \\ - \tfrac{1}{2}\log\frac{p_{00}p_{11}}{p_{01}p_{10}} & \log\frac{p_{10}}{p_{00}} - \end{pmatrix}. -\end{displaymath} -The marginal distribution of $X_1$ and $X_2$ are given by -\begin{align*} - P(X_1 = x_1) &= P(X = (x_1, 0)) + P(X = (x_1, 1)) \\ - &= p_{00}^{1-x_1}p_{10}^{x_1} + p_{01}^{1-x_1}p_{11}^{x_1} \\ - &= \begin{cases} - p_{00} + p_{01}, & x_1 = 0 \\ - p_{01} + p_{11}, & x_1 = 1 - \end{cases} \\ - &= (p_{00} + p_{01})^{1-x_1}(p_{01} + p_{11})^{x_1}. \\ - P(X_2 = x_2) &= (p_{00} + p_{10})^{1-x_2}(p_{10} + p_{11})^{x_2}. -\end{align*} -Furthermore, the conditional distributions are -\begin{align*} - P(X_1 = x_1|X_2 = x_2) = \frac{P(X = (x_1, x_2))}{P(X_2 = x_2)} - \propto \big(p_{00}^{1-x_2}p_{01}^{x_2}\big)^{1-x_1}\big(p_{10}^{1-x_2}p_{11}^{x_2}\big)^{x_1}, \\ - P(X_2 = x_2|X_1 = x_1) = \frac{P(X = (x_1, x_2))}{P(X_1 = x_1)} - \propto \big(p_{00}^{1-x_1}p_{10}^{x_1}\big)^{1-x_2}\big(p_{01}^{1-x_1}p_{11}^{x_1}\big)^{x_2}. -\end{align*} -Note that both the marginal and the conditional are again Bernoulli distributed. 
Its also of interest to look at the covariance between the components of $X$ which are given by -\begin{displaymath} - \cov(X_1, X_2) = \E[(X_1 - \E X_1)(X_2 - \E X_2)] = p_{00}p_{11} - p_{01}p_{10} -\end{displaymath} -which follows by direct computation. - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{Multivariate Bernoulli Distribution} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -This is a direct generalization of the Bivariate Bernoulli Distribution. Before we start a few notes on notation. Let $a, b$ be binary vectors, then $\logical{a = b} = 1$ if and only if $\forall i : a_i = b_i$ and zero otherwise. With that, let $Y\in\{0, 1\}^q$ be a $q$-dimensional \emph{Multivariate Bernoulli} random variable with pdf -\begin{equation}\label{eq:mvb_pmf} - P(Y = y) = \prod_{a\in\{0, 1\}^q} p_a^{\logical{y = a}} = p_y. -\end{equation} -The parameters are $2^q$ parameters $p_a$ which are indexed by the event $a\in\{0, 1\}^q$. The ``indexing'' is done by identifying an event $a\in\{0, 1\}^q$ with the corresponding binary number $m$ the event represents. In more detail we equate an event $a\in\{0, 1\}^q$ with a number $m\in[0; 2^q - 1]$ as -\begin{equation}\label{eq:mvb_numbering} - m = m(a) = \sum_{i = 1}^q 2^{q - i}a_i -\end{equation} -which is a one-to-one relation. For example, for $q = 3$ all events are numbered as in Table~\ref{tab:event-to-number}. -\begin{table}[!ht] - \centering - \begin{minipage}{0.8\textwidth} - \centering - \begin{tabular}{c|c} - Event $a$ & Number $m(a)$ \\ \hline - (0, 0, 0) & 0 \\ - (0, 0, 1) & 1 \\ - (0, 1, 0) & 2 \\ - (0, 1, 1) & 3 \\ - (1, 0, 0) & 4 \\ - (1, 0, 1) & 5 \\ - (1, 1, 0) & 6 \\ - (1, 1, 1) & 7 - \end{tabular} - \caption{\label{tab:event-to-number}\small Event numbering relation for $q = 3$. 
The events $a$ are all the possible elements of $\{0, 1\}^3$ and the numbers $m$ range from $0$ to $2^3 - 1 = 7$.} - \end{minipage} -\end{table} - -\subsection{Exponential Family and Natural Parameters} -The Multivariate Bernoulli is a member of the exponential family. This can be seen by rewriting the pmf \eqref{eq:mvb_pmf} in terms of an exponential family. First, we take a look at $\logical{y = a}$ for two binary vectors $y, a$ which can be written as -\begin{align*} - \logical{y = a} - &= \prod_{i = 1}^q (y_i a_i + (1 - y_i)(1 - a_i)) - = \prod_{i = 1}^q (y_i (2 a_i - 1) + (1 - a_i)) \\ - &= \sum_{b\in\{0, 1\}^q}\prod_{i = 1}^q [y_i (2 a_i - 1)]^{b_i}(1 - a_i)^{1-b_i} \\ -\intertext{where the last equality follows by multiplying it out similar to the binomial theorem. Note that the inner product is zero if there is at least one $i$ such that $a_i = 1$ but $b_i = 0$, cause then $(1 - a_i)^{1-b_i} = 0$ and $1$ in all other cases. Therefore, using $\logical{a \leq b}$ gives} - ... - &= \sum_{b\in\{0, 1\}^q}\logical{a\leq b}\prod_{i = 1}^q [y_i (2 a_i - 1)]^{b_i} -\intertext{Next, given $\logical{a \leq b}$ we get that $\prod_{i = 1}^q (2 a_i - 1)^{b_i} = (-1)^\logical{|b| \equiv_2 |a|}$ by counting the number of zeros in $a$ where at the same index $b$ is one. Cause $(2 a_i - 1)^{b_i} = -1$ for $a_i = 0$ and $b_i = 1$ and $1$ in every other case. This is encoded in $|b| \equiv_2 |a|$ as this is true if there are even number of $a_i = 0$ and $b_i = 1$ cases and false otherwise. This leads to the final version of the rewriting of $\logical{y = a}$ as -} - ... - &= \sum_{b\in\{0, 1\}^q}\logical{a\leq b}(-1)^\logical{|b|\equiv_2|a|}\prod_{i = 1}^q y_i^{b_i}. 
-\end{align*} -Now, taking the log of \eqref{eq:mvb_pmf} and substituting $\logical{y = a}$ gives -\begin{align*} - \log P(Y = y) - &= \sum_{a\in\{0, 1\}^q}\logical{y = a}\log p_a \\ - &= \sum_{b\in\{0, 1\}^q}\sum_{a\in\{0, 1\}^q}\log(p_a)\logical{a\leq b}(-1)^\logical{|b|\equiv_2|a|}\prod_{i = 1}^q y_i^{b_i} \\ - &= \sum_{b\in\{0, 1\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\sum_{a\leq b}\log(p_a)(-1)^\logical{|b|\equiv_2|a|} \\ - &= \sum_{b\in\{0, 1\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\log\frac{\prod_{a\leq b, |a|\equiv_2|b|}p_a}{\prod_{a\leq b, |a|\not\equiv_2|b|}p_a} -\end{align*} -For each $b\in\{0, 1\}^q$ except for $b = (0, ..., 0)$ define -\begin{displaymath} - \theta_{m(b)} = \theta_b = \log\frac{\prod_{a\leq b, |a|\equiv_2|b|}p_a}{\prod_{a\leq b, |a|\not\equiv_2|b|}p_a}, \qquad b\in\{0, 1\}^q\backslash\{0\}^q -\end{displaymath} -and collect the thetas in a combined vetor $\mat{\theta} = (\theta_1, \theta_2, ..., \theta_{2^q - 1})$ where we used the Bernoulli event to number identification of \eqref{eq:mvb_numbering}. The zero event is excluded here as casue its not needed. The reason therefore is that its already determined by all the other parameters and will be incorporated as the pmf scaling factor in the exponential family representation. Using the newly defined $\mat{\theta}$ we get -\begin{align*} - P(Y = y) &= \exp\log P(Y = y) - = \exp\left(\sum_{b\in\{0, 1\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\log\frac{\prod_{a\leq b, |a|\equiv_2|b|}p_a}{\prod_{a\leq b, |a|\not\equiv_2|b|}p_a}\right) \\ - &= p_0\exp\left(\sum_{b\in\{0, 1\}^q\backslash\{0\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\theta_b\right) -\end{align*} -The final step is to determin an representation of $p_0$ is terms of $\mat{\theta}$. But this follows simply by the fact that probabilities sum to $1$. 
-\begin{align*} - 1 &= \sum_{y\in\{0, 1\}^q}P(Y = y) = p_0\left(1 + \sum_{y\in\{0, 1\}^q\backslash\{0\}^q}\exp\left(\sum_{b\in\{0, 1\}^q\backslash\{0\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\theta_b\right)\right), \\ - p_0(\mat{\theta}) &= \left(1 + \sum_{y\in\{0, 1\}^q\backslash\{0\}^q}\exp\left(\sum_{b\in\{0, 1\}^q\backslash\{0\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\theta_b\right)\right)^{-1}. -\end{align*} -This gives the pmf representation as -\begin{equation}\label{eq:mvb_exp_fam} - P(Y = y) = p_0(\mat{\theta})\exp\left(\sum_{b\in\{0, 1\}^q\backslash\{0\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\theta_b\right) - = p_0(\mat{\theta})\exp(\t{T(y)}\mat{\theta}) -\end{equation} -which proves the fact that the Multivariate Bernoulli is a member of the exponential family. Furthermore, the statistic $T(y)$ in \eqref{eq:mvb_exp_fam} is -\begin{displaymath} - T(y) = \left(\prod_{i = 1}^q y_i^{b_i}\right)_{b\in\{0, 1\}^q\backslash\{0\}^q} -\end{displaymath} -which is a $2^q - 1$ dimensional binary vector. - -\subsection{Expectation and Covariance} -First the expectation of a Multivariate Bernoulli $Y\sim\mathcal{B}_p$ is given by -\begin{displaymath} - (\E Y)_j = (\E Y_j) = \sum_{\substack{y\in\{0, 1\}^q}} P(Y = y)y_j = \sum_{\substack{y\in\{0, 1\}^q\\y_j = 1}}P(Y = y) -\end{displaymath} -for each of the $j = 1, ..., q$ components of the $q$-dimensional random vector $Y$. Its covariance is similar given by -\begin{displaymath} - \cov(Y_i, Y_j) - = \E(Y_i - \E Y_i)(Y_j - \E Y_j) - = \E Y_i Y_j - (\E Y_i)(\E Y_j) - = \sum_{\substack{y\in\{0, 1\}^q\\y_i = y_j = 1}}P(Y = y) - (\E Y_i)(\E Y_j). 
-\end{displaymath} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{The Ising Model} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -The Ising model is a special case of the Mutlivariate Bernoulli with pmf defined directly in its exponential family form by -\begin{displaymath} - P_{\mat{\theta}}(Y = \mat{y}) = p_0(\mat{\theta})\exp\left(\sum_{i = 1}^q \theta_{\iota(i, i)}y_i + \sum_{i = 1}^{q - 1} \sum_{j = i + 1}^q \theta_{\iota(i, j)}y_i y_j\right). -\end{displaymath} -This ``constraint'' model only considures two way interactions with the natural parameters $\mat{\theta}$ of size $q(q + 1)/2$. The indexing function $\iota$ maps the vector indices to the corresponding parameter indices -\begin{align*} - \iota(i, j) &= \iota_0(\min(i, j) - 1, \max(i, j) - 1) + 1, & i, j &= 1, ..., q \\ -\intertext{with the $0$-indexed mapping} - \iota_0(i, j) &= \frac{i (2 q + 1 - i)}{2} + (j - i) & i, j &= 0, ..., q - 1\text{ and }i\leq j. -\end{align*} -This index mapping is constructed such that the half vectorization of the outer product $\mat{y}\t{\mat{y}}$ corresponds in its singe and two way interactions between components to the appropriate parameter indices in $\mat{\theta}$. In other words, above pmf can be written as -\begin{displaymath} - P_{\mat{\theta}}(Y = \mat{y}) = p_0(\mat{\theta})\exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} ). -\end{displaymath} -The scaling factor $p_0(\mat{\theta})$ (which is also the probability of the zero event, therefore the name) is -\begin{displaymath} - p_0(\mat{\theta}) = \Big( \sum_{y\in\{0, 1\}^q} \exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} ) \Big)^{-1}. -\end{displaymath} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Conditional Distribution} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -{\color{red} TODO: Fix this, its wrong!!! 
- -For the conditional distribution under the Ising model let $I\subsetneq[q]$ be a non-empty index set (non-empty cause this corresponds to no conditioning and not equal to avoid $P_{\mat{\theta}}(\emptyset|Y = \mat{y})$). Then denote with $\mat{y}_I$ the $|I|$ sub-vector of $\mat{y}$ consisting only of the indices in $I$ while $\mat{y}_{-I}$ is the $q - |I|$ vector with all indices \emph{not} in $I$. -\begin{displaymath} - P_{\mat{\theta}}(Y_{I} = \mat{y}_{I} | Y_{-I} = \mat{y}_{-I}) - = \frac{P_{\mat{\theta}}(Y = \mat{y})}{P_{\mat{\theta}}(Y_{-I} = \mat{y}_{-I})} - \propto (\mat{a}\mapsto P_{\mat{\theta}}(Y_{I} = \mat{a}, Y_{-I} = \mat{y}_{-I}))|_{\mat{a} = \mat{y}_I} -\end{displaymath} -now noting that -\begin{displaymath} - \vech(\mat{y}\t{\mat{y}}) = \vech\Big(\mat{y}\t{\big(\mat{y} - \sum_{i\in I}y_i\mat{e}_i\big)}\Big) - + \vech\Big(\mat{y}\t{\big(\sum_{i\in I}y_i\mat{e}_i\big)}\Big) -\end{displaymath} -with the left summand depending only on $\mat{y}_{-I}$ due to the half vectorization only considureing the lower triangular part and the main diagonal. 
By substitution into the conditional probability we get -\begin{align*} - P_{\mat{\theta}}(Y_{I} = \mat{y}_{I} | Y_{-I} = \mat{y}_{-I}) - &\propto \exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} ) \\ - &= \exp\Big( \t{\vech\Big(\mat{y}\t{\big(\mat{y} - \sum_{i\in I}y_i\mat{e}_i\big)}\Big)}\mat{\theta} \Big) - \exp\Big( \t{\vech\Big(\mat{y}\t{\big(\sum_{i\in I}y_i\mat{e}_i\big)}\Big)}\mat{\theta} \Big) \\ - &\propto \exp\Big( \t{\vech\Big(\mat{y}\t{\big(\sum_{i\in I}y_i\mat{e}_i\big)}\Big)}\mat{\theta} \Big) \\ - &= \prod_{i\in I}\exp( \t{\vech(\mat{y}\t{\mat{e}_i})}\mat{\theta} )^{y_i} -\end{align*} -leading to the scaled form -\begin{displaymath} - P_{\mat{\theta}}(Y_{I} = \mat{y}_{I} | Y_{-I} = \mat{y}_{-I}) - = p_0(\mat{\theta} | Y_{-I} = \mat{y}_{-I})\prod_{i\in I}\exp( \t{\vech(\mat{y}\t{\mat{e}_i})}\mat{\theta} )^{y_i} -\end{displaymath} -with -\begin{displaymath} - p_0(\mat{\theta} | Y_{-I} = \mat{y}_{-I}) - = P_{\mat{\theta}}(Y_{I} = \mat{0} | Y_{-I} = \mat{y}_{-I}) - = \Big(\sum_{\substack{\mat{a}\in\{0, 1\}^q\\\mat{a}_{-I} = \mat{y}_{-I}}} - \prod_{i\in I}\exp( \t{\vech(\mat{a}\t{\mat{e}_i})}\mat{\theta} )^{a_i}\Big)^{-1}. -\end{displaymath} - -} % end of TODO: Fix this, its wrong!!! 
- -Two special cases are of interest, first the single component case with $I = \{i\}$ -\begin{displaymath} - P_{\mat{\theta}}(Y_{i} = {y}_{i} | Y_{-i} = \mat{y}_{-i}) - = \frac{\exp( \t{\vech(\mat{y}\t{\mat{e}_i})}\mat{\theta} )^{y_i}}{1 + \exp( \t{\vech(\mat{y}\t{\mat{e}_i})}\mat{\theta} )} -\end{displaymath} -and with $\mat{y}_{-i} = \mat{0}$ we get -\begin{displaymath} - P_{\mat{\theta}}(Y_{i} = {y}_{i} | Y_{-i} = \mat{0}) - = \frac{\exp( {\theta}_{\iota(i)} )^{y_i}}{1 + \exp( {\theta}_{\iota(i)} )} -\end{displaymath} -leading to -\begin{displaymath} - \theta_{\iota(i)} = \log\frac{P_{\mat{\theta}}(Y_{i} = 1 | Y_{-i} = \mat{0})}{P_{\mat{\theta}}(Y_{i} = 0 | Y_{-i} = \mat{0})} = \log\frac{P_{\mat{\theta}}(Y_{i} = 1 | Y_{-i} = \mat{0})}{1 - P_{\mat{\theta}}(Y_{i} = 1 | Y_{-i} = \mat{0})} -\end{displaymath} - -The second case considures the conditional distribution of two components given all the rest is zero, meaning that $I = \{i, j\}$ and $\mat{y}_{-i,-j} = \mat{0}$ which has the form -\begin{align*} - P_{\mat{\theta}}(Y_{i} = {y}_{i}, Y_{j} = {y}_{j} | Y_{-i,-j} = \mat{0}) - &= p_0(\mat{\theta} | Y_{-i,-j} = \mat{0})\exp( \t{\vech(\mat{y}\t{\mat{e}_i})}\mat{\theta} )^{y_i} - \exp( \t{\vech(\mat{y}\t{\mat{e}_j})}\mat{\theta} )^{y_j} \\ - &= p_0(\mat{\theta} | Y_{-i,-j} = \mat{0})\exp(y_i\theta_{\iota(i)} + y_j\theta_{\iota(j)} + y_iy_j\theta_{\iota(i,j)}) -\end{align*} -By setting the combinations of $y_i, y_j\in\{0, 1\}$ we get that -\begin{align*} - \theta_{\iota(i, j)} - &= \log\frac{P_{\mat{\theta}}(Y_{i} = 0, Y_{j} = 0 | Y_{-i,-j} = \mat{0})P_{\mat{\theta}}(Y_{i} = 1, Y_{j} = 1 | Y_{-i,-j} = \mat{0})}{P_{\mat{\theta}}(Y_{i} = 0, Y_{j} = 1 | Y_{-i,-j} = \mat{0})P_{\mat{\theta}}(Y_{i} = 1, Y_{j} = 0 | Y_{-i,-j} = \mat{0})} \\ - &= \log\frac{P_{\mat{\theta}}(Y_i = Y_j = 1 | Y_{-i,-j} = \mat{0})}{(1 - P_{\mat{\theta}}(Y_i = Y_j = 1 | Y_{-i,-j} = \mat{0}))}\frac{(1 - P_{\mat{\theta}}(Y_i = 1 | Y_{-i} = \mat{0})P_{\mat{\theta}}(Y_j = 1 | Y_{-j} = 
\mat{0}))}{P_{\mat{\theta}}(Y_i = 1 | Y_{-i} = \mat{0})P_{\mat{\theta}}(Y_j = 1 | Y_{-j} = \mat{0})}. -\end{align*} -Note that we have expressed all of the natural parameters $\mat{\theta}$ in terms of conditional probabilities. Either one component is $1$ and the rest is conditioned to be zero, or two components are $1$ and the rest is conditioned to be zero. This means that the set of those conditional probabilities is a sufficient statistic. Let us denote the vector of those as $\mat{\pi}$ with the same index mapping as used in $\mat{\theta}$; more precisely, let the $q(q + 1) / 2$ dimensional vector $\mat{\pi}$ be defined component-wise as -\begin{align*} - {\pi}_{\iota(i)} = {\pi}(\mat{\theta})_{\iota(i)} &= P_{\mat{\theta}}(Y_i = 1 | Y_{-i} = \mat{0}) = \frac{\exp({\theta}_{\iota(i)})}{1 + \exp({\theta}_{\iota(i)})} \\ - {\pi}_{\iota(i, j)} = {\pi}(\mat{\theta})_{\iota(i, j)} &= P_{\mat{\theta}}(Y_i = Y_j = 1 | Y_{-i, -j} = \mat{0}) \\ - &= \frac{\exp(\theta_{\iota(i)} + \theta_{\iota(j)} + \theta_{\iota(i, j)})}{1 + \exp(\theta_{\iota(i)}) + \exp(\theta_{\iota(j)}) + \exp(\theta_{\iota(i)} + \theta_{\iota(j)} + \theta_{\iota(i, j)})} -\end{align*} -and the component-wise inverse relation is given by -\begin{equation} - \begin{aligned}[c] - \theta_{\iota(i)} = \theta(\mat{\pi})_{\iota(i)} - &= \log\frac{\pi_{\iota(i)}}{1 - \pi_{\iota(i)}} \\ - \theta_{\iota(i, j)} = \theta(\mat{\pi})_{\iota(i, j)} - &= \log\frac{(1 - \pi_{\iota(i)}\pi_{\iota(j)})\pi_{\iota(i, j)}}{\pi_{\iota(i)}\pi_{\iota(j)}(1 - \pi_{\iota(i, j)})}. 
- \end{aligned}\label{eq:ising_theta_from_cond_prob} -\end{equation} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Log-Likelihood, Score and Fisher Information} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -The log-likelihood for a given dataset $\mat{y}_i$, $n = 1, ..., n$ is -\begin{displaymath} - l(\mat{\theta}) - = \log\prod_{i = 1}^n P_{\mat{\theta}}(Y = \mat{y}_i) - = \log\prod_{i = 1}^n p_0(\mat{\theta})\exp( \t{\vech(\mat{y}_i\t{\mat{y}_i})}\mat{\theta} ) - = n\log p_0(\mat{\theta}) + \sum_{i = 1}^n \t{\vech(\mat{y}_i\t{\mat{y}_i})}\mat{\theta}. -\end{displaymath} -For computing the Score we first get the differential -\begin{align*} - \d l(\mat{\theta}) - &= n p_0(\mat{\theta})^{-1} \d p_0(\mat{\theta}) + \sum_{i = 1}^n \t{\vech(\mat{y}_i\t{\mat{y}_i})}\d\mat{\theta} \\ - &= -n p_0(\mat{\theta}) \sum_{y\in\{0, 1\}^q} \exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} )\t{\vech(\mat{y}\t{\mat{y}})}\d\mat{\theta} + \sum_{i = 1}^n \t{\vech(\mat{y}_i\t{\mat{y}_i})}\d\mat{\theta} -\end{align*} -leading to the Score -\begin{align} - \nabla_{\mat{\theta}} l - &= -n p_0(\mat{\theta}) \sum_{y\in\{0, 1\}^q} \exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} )\vech(\mat{y}\t{\mat{y}}) + \sum_{i = 1}^n \vech(\mat{y}_i\t{\mat{y}_i}) \nonumber \\ - &= -n \E_{\mat{\theta}} \vech(Y\t{Y}) + \sum_{i = 1}^n \vech(\mat{y}_i\t{\mat{y}_i}). 
\label{eq:ising_score} -\end{align} -The second differential of the log-likelihood is -\begin{multline*} - \d^2 l(\mat{\theta}) - = n p_0(\mat{\theta})^2 \d\t{\mat{\theta}} \sum_{y\in\{0, 1\}^q} \exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} )\vech(\mat{y}\t{\mat{y}}) \sum_{y\in\{0, 1\}^q} \exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} )\t{\vech(\mat{y}\t{\mat{y}})}\d\mat{\theta} \\ - - n p_0(\mat{\theta}) \sum_{y\in\{0, 1\}^q} \d\t{\mat{\theta}}\exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} )\vech(\mat{y}\t{\mat{y}})\t{\vech(\mat{y}\t{\mat{y}})}\d\mat{\theta} + 0 -\end{multline*} -leading to the Hessian -\begin{align*} - \nabla^2_{\mat{\theta}} l - &= n (\E_{\mat{\theta}}\vech(Y\t{Y}))\t{(\E_{\mat{\theta}}\vech(Y\t{Y}))} - n \E_{\mat{\theta}}\vech(Y\t{Y})\t{\vech(Y\t{Y})} \\ - &= -n \cov_{\mat{\theta}}(\vech(Y\t{Y}), \vech(Y\t{Y})). -\end{align*} -From this the Fisher Information is directly given as -\begin{displaymath} - \mathcal{I}(\mat{\theta}) = -\E_{\mat{\theta}} \nabla^2_{\mat{\theta}} l - = n \cov_{\mat{\theta}}(\vech(Y\t{Y}), \vech(Y\t{Y})). -\end{displaymath} - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Estimation} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -For the estimation we use the Fisher scoring algorithm which is an iterative updating algorithm following the updating rule -\begin{displaymath} - \mat{\theta}_{t+1} = \mat{\theta}_{t} + \mathcal{I}(\mat{\theta})^{-1}\nabla_{\mat{\theta}} l(\mat{\theta}). -\end{displaymath} -The basic idea behind Fisher scoring is to consider a Taylor expansion of the score -\begin{displaymath} - \nabla_{\mat{\theta}} l(\mat{\theta}^*) \approx \nabla_{\mat{\theta}}l(\mat{\theta}) + \nabla^2_{\mat{\theta}} l(\mat{\theta})(\mat{\theta}^* - \mat{\theta}). 
-\end{displaymath} -Setting $\mat{\theta}^*$ to the MLE estimate $\widehat{\mat{\theta}}$ we get with $\nabla_{\mat{\theta}} l(\widehat{\mat{\theta}}) = 0$ that -\begin{align*} - 0 &\approx \nabla_{\mat{\theta}}l(\mat{\theta}) + \nabla^2_{\mat{\theta}} l(\mat{\theta})(\widehat{\mat{\theta}} - \mat{\theta}) \\ - \widehat{\mat{\theta}} &\approx \mat{\theta} - (\nabla^2_{\mat{\theta}} l(\mat{\theta}))^{-1}\nabla_{\mat{\theta}}l(\mat{\theta}). -\end{align*} -Now, replacing the observed information $\nabla^2_{\mat{\theta}} l(\mat{\theta})$ with the Fisher information $\mathcal{I}(\mat{\theta}) = -\E_{\mat{\theta}} \nabla^2_{\mat{\theta}} l(\mat{\theta})$ leads to -\begin{displaymath} - \widehat{\mat{\theta}} \approx \mat{\theta} + \mathcal{I}(\mat{\theta})^{-1}\nabla_{\mat{\theta}}l(\mat{\theta}) -\end{displaymath} -which is essentially the above updating rule. - -For an initial estimate $\mat{\theta}_0$ we can evaluate the Score \eqref{eq:ising_score} at the MLE estimate $\widehat{\mat{\theta}}$ leading to -\begin{displaymath} - \E_{\widehat{\mat{\theta}}} \vech(Y\t{Y}) = \frac{1}{n}\sum_{i = 1}^n \vech(\mat{y}_i\t{\mat{y}_i}). -\end{displaymath} -With $\E_{\widehat{\mat{\theta}}} \vech(Y\t{Y})_{\iota(i, j)} = P_{\widehat{\mat{\theta}}}(Y_iY_j = 1)$ we may treat the marginal probabilities to be not that far off the conditional probabilities and set $\mat{\pi}_0 = \E_{\widehat{\mat{\theta}}} \vech(Y\t{Y})$ from which we can compute $\mat{\theta}_0 = \mat{\theta}(\mat{\pi}_0)$ as in \eqref{eq:ising_theta_from_cond_prob}. - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{The Ising Model with Covariates} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -Now assume additional covariates $X\in\mathbb{R}^p$ given to the Multivariate Bernoulli variable $Y\in\{0, 1\}^q$ under the Ising model. 
Their relation is given by assuming that $Y$ follows the Ising model given $X$ -\begin{align*} - P_{\mat{\theta}_X}(Y = \mat{y} | X) - &= p_0(\mat{\theta}_X)\exp(\t{\vech(\mat{y} \t{\mat{y}})}\mat{\theta}_X) \\ - &= p_0(\mat{\theta}_X)\exp(\t{\mat{y}}\mat{\Theta}_X\mat{y}). -\end{align*} -with $\mat{\Theta}$ being a $q\times q$ symmetric matrix such that $\t{\mat{y}}\mat{\Theta}\mat{y} = \t{\vech(\mat{y} \t{\mat{y}})}\mat{\theta}$ for any $\mat{y}$. The explicit relation is -\begin{align*} - \mat{\Theta} &= \tfrac{1}{2}(\mat{1}_q\t{\mat{1}_q} + \mat{I}_q)\odot\vech^{-1}(\mat{\theta}), \\ - \mat{\theta} &= \vech((2\mat{1}_q\t{\mat{1}_q} - \mat{I}_q)\odot\mat{\Theta}). -\end{align*} -Assuming centered covariates $\E X = 0$ (w.l.o.g. because we can always replace $X$ with $X - \E X$) and that the covariate dependent parameters $\mat{\Theta}_X$ relate to $X$ by -\begin{displaymath} - \mat{\Theta}_X = \t{\mat{\alpha}}X\t{X}\mat{\alpha} -\end{displaymath} -for an unconstrained parameter matrix $\mat{\alpha}$ of dimensions $p\times q$ leads to the Ising model with covariates of the form -\begin{displaymath} - P_{\mat{\alpha}}(Y = \mat{y} | X) - = p_0(\mat{\alpha}, X)\exp(\t{\mat{y}}\t{\mat{\alpha}}X\t{X}\mat{\alpha}\mat{y}). -\end{displaymath} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Log-likelihood, Score and Fisher information} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -Given a single observation $(\mat{y}, \mat{x})$ the log-likelihood is -\begin{displaymath} - l(\mat{\alpha}) = \log P_{\mat{\alpha}}(Y = \mat{y} \mid X = \mat{x}) - = \log p_0(\mat{\alpha}, \mat{x}) + \t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y}. 
-\end{displaymath} -Before we write the differential of the log-likelihood we take a look at the following -\begin{align*} - \d(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y}) - &= \d\tr(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y}) \\ - &= 2 \tr(\mat{y}\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\d\mat{\alpha}) \\ - &= 2 \t{\vec(\mat{\alpha})}(\mat{y}\t{\mat{y}}\otimes\mat{x}\t{\mat{x}})\vec(\d\mat{\alpha}) \\ - \d\log p_0(\mat{\alpha}, \mat{x}) - &= -p_0(\mat{\alpha}, \mat{x})\sum_{y\in\{0, 1\}^q}\d\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y}) \\ - &= -2p_0(\mat{\alpha}, \mat{x})\sum_{y\in\{0, 1\}^q}\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})\tr(\mat{y}\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\d\mat{\alpha}) \\ - &= -2 \tr(\E_{\mat{\alpha}}[Y\t{Y}\mid X = \mat{x}]\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\d\mat{\alpha}) \\ - &= -2 \t{\vec(\mat{\alpha})}(\E_{\mat{\alpha}}[Y\t{Y}\mid X = \mat{x}]\otimes\mat{x}\t{\mat{x}})\vec(\d\mat{\alpha}) -\end{align*} -Therefore, the differential of the log-likelihood is -\begin{displaymath} - \d l(\mat{\alpha}) - = 2 \tr((\mat{y}\t{\mat{y}} - \E_{\mat{\alpha}}[Y\t{Y}\mid X = \mat{x}])\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\d\mat{\alpha}) -\end{displaymath} -or equivalently the Score -\begin{align} - \nabla_{\mat{\alpha}}l - &= 2 \mat{x}\t{\mat{x}}\mat{\alpha}(\mat{y}\t{\mat{y}} - \E_{\mat{\alpha}}[Y\t{Y}\mid X = \mat{x}]) \nonumber \\ - &= 2 \mat{x}\t{\mat{x}}\mat{\alpha}\vec^{-1}(\mat{D}_q(\vech(\mat{y}\t{\mat{y}}) - \E_{\mat{\theta}_{\mat{\alpha}}(\mat{x})}[\vech(Y\t{Y})\mid X=\mat{x}])) \label{eq:ising_cond_score} \\ - &= 2 \mat{x}\t{\mat{x}}\mat{\alpha}\vec^{-1}(\mat{D}_q\nabla_{\mat{\theta}}l(\mat{\theta}_{\mat{\alpha}}(\mat{x}); \mat{y})) \nonumber -\end{align} -where $\nabla_{\mat{\theta}}l(\mat{\theta}_{\mat{\alpha}}(\mat{x}); \mat{y})$ is the Score \eqref{eq:ising_score} for a single observation and $\mat{D}_q$ is the dublication matrix. 
- -Now we continue with the second-order differential of the log-likelihood, therefore considure -\begin{align*} - \d^2(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y}) - &= 2 \tr(\mat{y}\t{\mat{y}}\t{(\d\mat{\alpha})}\mat{x}\t{\mat{x}}\d\mat{\alpha}) + 2\tr(\mat{y}\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\d^2\mat{\alpha}) \\ - &= 2\t{\vec(\d\mat{\alpha})}(\mat{y}\t{\mat{y}}\otimes\mat{x}\t{\mat{x}})\vec(\d\mat{\alpha}) + 0. -\end{align*} -The next term is -\begin{align*} - \d^2 \log p_0(\mat{\alpha}, \mat{x}) - &= \d\Big( -2p_0(\mat{\alpha}, \mat{x})\sum_{y\in\{0, 1\}^q}\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})\tr(\mat{y}\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\d\mat{\alpha}) \Big) \\ -\intertext{To shorten the expressions let $A_{\mat{y}} = (\mat{y}\t{\mat{y}}\otimes \mat{x}\t{\mat{x}})\vec{\mat{\alpha}}$, then} - \ldots &= \d\Big( -2p_0(\mat{\alpha}, \mat{x})\sum_{y\in\{0, 1\}^q}\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})\t{A_{\mat{y}}}\vec(\d\mat{\alpha}) \Big) \\ - &= -2(\d p_0(\mat{\alpha}, \mat{x}))\sum_{y\in\{0, 1\}^q}\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})\t{A_{\mat{y}}}\vec(\d\mat{\alpha}) \Big) \\ - &\qquad -4 p_0(\mat{\alpha}, \mat{x})\sum_{y\in\{0, 1\}^q}\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})\t{\vec(\d\mat{\alpha})}A_{\mat{y}}\t{A_{\mat{y}}}\vec(\d\mat{\alpha}) \Big) \\ - &\qquad\qquad -2 p_0(\mat{\alpha}, \mat{x})\sum_{y\in\{0, 1\}^q}\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})\t{\vec(\d\mat{\alpha})}(\mat{y}\t{\mat{y}}\otimes\mat{x}\t{\mat{x}})\vec(\d\mat{\alpha}) \Big) \\ - &= 4\t{\vec(\d\mat{\alpha})} \E_{\mat{\alpha}}[A_{Y} \mid X = \mat{x}]\E_{\mat{\alpha}}[\t{A_{Y}} \mid X = \mat{x}] \vec(\d\mat{\alpha}) \\ - &\qquad -4\t{\vec(\d\mat{\alpha})} \E_{\mat{\alpha}}[A_{Y}\t{A_{Y}} \mid X = \mat{x}] \vec(\d\mat{\alpha}) \\ - &\qquad\qquad -2\t{\vec(\d\mat{\alpha})} 
\E_{\mat{\alpha}}[Y\t{Y}\otimes \mat{x}\t{\mat{x}} \mid X = \mat{x}] \vec(\d\mat{\alpha}) \\ - &= -\t{\vec(\d\mat{\alpha})}( 4\cov_{\mat{\alpha}}(A_{Y} \mid X = \mat{x}) - + 2 \E_{\mat{\alpha}}[Y\t{Y}\otimes \mat{x}\t{\mat{x}} \mid X = \mat{x}]) \vec(\d\mat{\alpha}) -\end{align*} -Back substituting to get the second order differential of the log-likelihood yields -\begin{displaymath} - \d^2 l(\mat{\alpha}) = \t{\vec(\d\mat{\alpha})}( - 2(\mat{y}\t{\mat{y}} - \E_{\mat{\alpha}}[Y\t{Y} \mid X = \mat{x}])\otimes \mat{x}\t{\mat{x}} - - 4\cov_{\mat{\alpha}}(A_{Y} \mid X = \mat{x}) - ) \vec(\d\mat{\alpha}) -\end{displaymath} -leading to the Hessian of the log-likelihood -\begin{displaymath} - \nabla^2_{\vec{\mat{\alpha}}} l - = 2(\mat{y}\t{\mat{y}} - \E_{\mat{\alpha}}[Y\t{Y} \mid X = \mat{x}])\otimes \mat{x}\t{\mat{x}} - - 4\cov_{\mat{\alpha}}((Y\t{Y}\otimes \mat{x}\t{\mat{x}})\vec{\mat{\alpha}} \mid X = \mat{x}). -\end{displaymath} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Estimation} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -{\color{red}This needs to be figured out how to do this in a good way.} - -For the initial value we start by equating the Score \eqref{eq:ising_cond_score} to zero giving us -\begin{displaymath} - \widehat{\E}_{\mat{\theta}_{\mat{\alpha}}(\mat{x})}[\vech(Y\t{Y})\mid X=\mat{x}] = \frac{1}{n}\sum_{i = 1}^n \vech(\mat{y}_i\t{\mat{y}_i}) -\end{displaymath} -as an estimate of the marginal probabilities of the singe and two way interaction effects $\E_{\mat{\theta}_{\mat{\alpha}}(\mat{x})}[\vech(Y\t{Y})\mid X=\mat{x}]$. By assuming that the marginal probabilities are similar to the conditional probabilities $\mat{\pi}$ we simply equate them to get an initial estimate for the conditional probabilities -\begin{displaymath} - \widehat{\mat{\pi}}_0 = \frac{1}{n}\sum_{i = 1}^n \vech(\mat{y}_i\t{\mat{y}_i}). 
-\end{displaymath} -Using the relation \eqref{eq:ising_theta_from_cond_prob} we compute an initial estimate for the natural parameters -\begin{displaymath} - \widehat{\mat{\theta}}_0 = \widehat{\mat{\theta}}(\widehat{\mat{\pi}}_0) -\end{displaymath} -and convert it to the matrix version of the parameters -\begin{displaymath} - \widehat{\mat{\Theta}}_0 = \tfrac{1}{2}(\mat{1}_q\t{\mat{1}_q} + \mat{I}_q)\odot \vech^{-1}(\widehat{\mat{\theta}}_0). -\end{displaymath} -Let $\widehat{\mat{\Sigma}} = \frac{1}{n}\sum_{i = 1}^n \mat{x}_i\t{\mat{x}_i}$ then we get -\begin{displaymath} - \widehat{\mat{\Theta}}_0 = \t{\widehat{\mat{\alpha}}_0}\widehat{\mat{\Sigma}}\widehat{\mat{\alpha}}_0. -\end{displaymath} -Next we define $m = \min(p, q)$ and take a rank $m$ approximation of both $\widehat{\mat{\Theta}}_0$ and $\widehat{\mat{\Sigma}}_0$ via an SVD. These approximations have the form -\begin{align*} - \widehat{\mat{\Theta}}_0 &\approx \mat{U}_{\widehat{\mat{\Theta}}_0} \mat{D}_{\widehat{\mat{\Theta}}_0} \t{\mat{U}_{\widehat{\mat{\Theta}}_0}} \\ - \widehat{\mat{\Sigma}}_0 &\approx \mat{U}_{\widehat{\mat{\Sigma}}_0} \mat{D}_{\widehat{\mat{\Sigma}}_0} \t{\mat{U}_{\widehat{\mat{\Sigma}}_0}} -\end{align*} -where $\mat{U}_{\widehat{\mat{\Theta}}_0}$, $\mat{U}_{\widehat{\mat{\Sigma}}_0}$ are semi-orthogonal matrices of dimensions $q\times m$, $p\times m$, respectively. Both the diagonal matrices $\mat{D}_{\widehat{\mat{\Theta}}_0}$, $\mat{D}_{\widehat{\mat{\Sigma}}_0}$ have dimensions $m\times m$. Substitution of the approximations into the above $\widehat{\mat{\Theta}}_0$ to $\widehat{\mat{\alpha}}_0$ relation yields -\begin{displaymath} - \mat{U}_{\widehat{\mat{\Theta}}_0} \mat{D}_{\widehat{\mat{\Theta}}_0} \t{\mat{U}_{\widehat{\mat{\Theta}}_0}} \approx \t{\widehat{\mat{\alpha}}_0}\mat{U}_{\widehat{\mat{\Sigma}}_0} \mat{D}_{\widehat{\mat{\Sigma}}_0} \t{\mat{U}_{\widehat{\mat{\Sigma}}_0}}\widehat{\mat{\alpha}}_0. 
-\end{displaymath} -Solving for $\widehat{\mat{\alpha}}_0$ leads to out initial value estimate -\begin{displaymath} - \widehat{\mat{\alpha}}_0 = \mat{U}_{\widehat{\mat{\Sigma}}_0} \mat{D}_{\widehat{\mat{\Sigma}}_0}^{-1/2}\mat{D}_{\widehat{\mat{\Theta}}_0}^{1/2} \t{\mat{U}_{\widehat{\mat{\Theta}}_0}}. -\end{displaymath} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\newpage -{\color{red}Some notes} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -With the conditional Fisher information $\mathcal{I}_{Y\mid X = \mat{x}}$ as -\begin{displaymath} - \mathcal{I}_{Y\mid X = \mat{x}}(\vec\mat{\alpha}) - = -\E_{\mat{\alpha}}[\nabla^2_{\vec{\mat{\alpha}}} l \mid X = \mat{x}] - = 4\cov_{\mat{\alpha}}((Y\t{Y}\otimes \mat{x}\t{\mat{x}})\vec{\mat{\alpha}} \mid X = \mat{x}). -\end{displaymath} -If we know that $X\sim f_X$ with a pdf (or pmf) $f_X$ for $X$ we get -\begin{displaymath} - \mathcal{I}_{Y|X}(\vec\mat{\alpha}) = \int\mathcal{I}_{Y\mid X = \mat{x}}(\vec\mat{\alpha})f_X(\mat{x})\,\d\mat{x} - = 4\E_{X}\cov_{\mat{\alpha}}((Y\t{Y}\otimes X\t{X})\vec{\mat{\alpha}} \mid X) -\end{displaymath} -by -\begin{displaymath} - f_{\theta}(Y, X) = f_{\theta}(Y\mid X)f_{\theta}(X) -\end{displaymath} -it follows that -\begin{displaymath} - l_{X, Y}(\theta) = \log f_{\theta}(Y, X) - = \log f_{\theta}(Y\mid X) + \log f_{\theta}(X) - = l_{Y\mid X}(\theta) + l_{X}(\theta) -\end{displaymath} -% With the fisher information in general beeing defined (under any distribution) $\mathcal{I}(\theta) = \E_{\theta} \nabla l(\theta)$ with the classical Fisher information under the joint distribution of $X, Y$ to be -% \begin{align*} -% \mathcal{I}_{X, Y}(\theta) -% &= \E_{X, Y}\nabla l_{X, Y}(\theta) \\ -% &= \E_{X, Y}\nabla l_{Y\mid X}(\theta) + \E_{X, Y}\nabla l_{X}(\theta) \\ -% &= \E_{X}\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\mid X] + \E_{X, Y}\nabla l_{X}(\theta) \\ -% &= \E_{X}\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\mid X] + 
\E_{X}\E_{Y\mid X}[\nabla l_{X}(\theta)\mid X] \\ -% &= \E_{X}\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\mid X] + \E_{X}\nabla l_{X}(\theta) \\ -% &= \mathcal{I}_{Y\mid X}(\theta) + \mathcal{I}_{X}(\theta) -% \end{align*} - -% What happens if we know the conditional distribution $Y\mid X\sim f_{\theta}$ only, meaning we do not know the distribution of $X$. Then the log-likelihood for a data set $(y_i, x_i)$ with $i = 1, ..., n$ observations has the form -% \begin{displaymath} -% l_{Y\mid X}(\theta) = \log\prod_{i = 1}^n f_{\theta}(Y = y_i \mid X = x_i) -% = \sum_{i = 1}^n \log f_{\theta}(Y = y_i \mid X = x_i) -% \end{displaymath} -% leading to -% \begin{displaymath} -% \nabla l_{Y\mid X}(\theta) = \sum_{i = 1}^n \nabla \log f_{\theta}(Y = y_i \mid X = x_i) -% \end{displaymath} - -\appendix -\section{Notes on Fisher Information} -Let $X\sim f_{\theta}$ be a random variable following a parameterized pdf (or pmf) $f_{\theta}$ with parameter vector $\theta$. Its log-likelihood (on the population, it is itself a random variable) is then -\begin{displaymath} - l(\theta) = \log f_{\theta}(X) -\end{displaymath} -and the Score is defined as the derivative of the log-likelihood -\begin{displaymath} - \nabla l(\theta) = \nabla\log f_{\theta}(X). -\end{displaymath} -The expectation of the Score is -\begin{displaymath} - \E \nabla l(\theta) - = \int f_{\theta}(x)\nabla\log f_{\theta}(x)\,\d x - = \int \nabla f_{\theta}(x)\,\d x - = \nabla \int f_{\theta}(x)\,\d x - = \nabla 1 - = 0. -\end{displaymath} -The Fisher information is defined as follows, which is identical to the covariance of the Score due to the zero expectation of the Score -\begin{displaymath} - \mathcal{I}(\theta) = \E \nabla l(\theta)\t{\nabla l(\theta)}. 
-\end{displaymath} - -Now assume we have two random variables $X, Y$ and a parameter vector $\theta$, then the joint distribution relates to the conditional and the marginal distribution by -\begin{displaymath} - f_{\theta}(X, Y) = f_{\theta}(Y\mid X)f_{\theta}(X) -\end{displaymath} -leading to the log-likelihood -\begin{displaymath} - l_{X, Y}(\theta) = \log f_{\theta}(X, Y) = \log f_{\theta}(Y\mid X) + \log f_{\theta}(X) - = l_{Y\mid X}(\theta) + l_{X}(\theta). -\end{displaymath} -The Score decomposes identically due to the linearity of differentiation -\begin{displaymath} - \nabla l_{X, Y}(\theta) - = \nabla l_{Y\mid X}(\theta) + \nabla l_{X}(\theta) -\end{displaymath} -and the Fisher Information decomposes the same way (due to a different argument) because -\begin{align*} - \mathcal{I}_{X, Y}(\theta) - &= \E_{X, Y}\nabla l_{X, Y}(\theta)\t{\nabla l_{X, Y}(\theta)} \\ - &= \E_{X, Y}(\nabla l_{Y\mid X}(\theta) + \nabla l_{X}(\theta))\t{(\nabla l_{Y\mid X}(\theta) + \nabla l_{X}(\theta))} \\ - &= \E_{X, Y}\nabla l_{Y\mid X}(\theta)\t{\nabla l_{Y\mid X}(\theta)} - + \E_{X, Y}\nabla l_{Y\mid X}(\theta)\t{\nabla l_{X}(\theta)} \\ - &\qquad + \E_{X, Y}\nabla l_{X}(\theta)\t{\nabla l_{Y\mid X}(\theta)} - + \E_{X, Y}\nabla l_{X}(\theta)\t{\nabla l_{X}(\theta)} \\ - &= \E_{X, Y}\nabla l_{Y\mid X}(\theta)\t{\nabla l_{Y\mid X}(\theta)} - + \E_{X, Y}\nabla l_{X}(\theta)\t{\nabla l_{X}(\theta)} -\end{align*} -where the last equality is due to -\begin{displaymath} - \E_{X, Y}\nabla l_{Y\mid X}(\theta)\t{\nabla l_{X}(\theta)} - = \E_{X}\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\mid X]\t{\nabla l_{X}(\theta)} - = 0 -\end{displaymath} -using $\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\mid X] = 0$, the zero expectation of the Score. 
The second term is identical and therefore we get -\begin{align*} - \mathcal{I}_{X, Y}(\theta) - &= \E_{X, Y}\nabla l_{Y\mid X}(\theta)\t{\nabla l_{Y\mid X}(\theta)} - + \E_{X, Y}\nabla l_{X}(\theta)\t{\nabla l_{X}(\theta)} \\ - &= \E_{X}\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\t{\nabla l_{Y\mid X}(\theta)} \mid X] - + \E_{X}\nabla l_{X}(\theta)\t{\nabla l_{X}(\theta)} \\ - &= \mathcal{I}_{Y\mid X}(\theta) + \mathcal{I}_{X}(\theta). -\end{align*} -Note the conditional Fisher Information which has the form -\begin{displaymath} - \mathcal{I}_{Y\mid X}(\theta) - = \E_{X}\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\t{\nabla l_{Y\mid X}(\theta)} \mid X] - = \int \mathcal{I}_{Y\mid X = x}(\theta)f_X(x)\,\d x -\end{displaymath} -Furthermore, in the case that the distribution of $X$ does not depend on $\theta$, meaning $f_{\theta}(X) = f(X)$, then $\mathcal{I}_X(\theta) = 0$ and $\mathcal{I}_{X, Y}(\theta) = \mathcal{I}_{Y \mid X}(\theta)$. - -\end{document} diff --git a/LaTeX/main.matrix.tex b/LaTeX/main.matrix.tex deleted file mode 100644 index fa24d08..0000000 --- a/LaTeX/main.matrix.tex +++ /dev/null @@ -1,815 +0,0 @@ -\documentclass[a4paper, 10pt]{article} - -\usepackage[utf8]{inputenc} -\usepackage[T1]{fontenc} -\usepackage{fullpage} -\usepackage{amsmath, amssymb, amstext, amsthm} -\usepackage{bm} % \boldsymbol and italic corrections, ... -\usepackage[pdftex]{hyperref} -\usepackage{makeidx} % Index (Symbols, Names, ...) -\usepackage{xcolor, graphicx} % colors and including images -\usepackage{tikz} -\usepackage[ - % backend=bibtex, - style=authoryear-comp -]{biblatex} - -% Document meta into -\title{Derivation of Gradient Descent Algorithm for K-PIR} -\author{Daniel Kapla} -\date{November 24, 2021} -% Set PDF title, author and creator. 
-\AtBeginDocument{ - \hypersetup{ - pdftitle = {Derivation of Gradient Descent Algorithm for K-PIR}, - pdfauthor = {Daniel Kapla}, - pdfcreator = {\pdftexbanner} - } -} - -\makeindex - -% Bibliography resource(s) -\addbibresource{main.bib} - -% Setup environments -% Theorem, Lemma -\theoremstyle{plain} -\newtheorem{theorem}{Theorem} -\newtheorem{lemma}{Lemma} -\newtheorem{example}{Example} -% Definition -\theoremstyle{definition} -\newtheorem{defn}{Definition} -% Remark -\theoremstyle{remark} -\newtheorem{remark}{Remark} - -% Define math macros -\newcommand{\mat}[1]{\boldsymbol{#1}} -\newcommand{\ten}[1]{\mathcal{#1}} -\renewcommand{\vec}{\operatorname{vec}} -\newcommand{\dist}{\operatorname{dist}} -\DeclareMathOperator{\kron}{\otimes} % Kronecker Product -\DeclareMathOperator{\hada}{\odot} % Hadamard Product -\newcommand{\ttm}[1][n]{\times_{#1}} % n-mode product (Tensor Times Matrix) -\DeclareMathOperator{\df}{\operatorname{df}} -\DeclareMathOperator{\tr}{\operatorname{tr}} -\DeclareMathOperator{\var}{Var} -\DeclareMathOperator{\cov}{Cov} -\DeclareMathOperator{\E}{\operatorname{\mathbb{E}}} -% \DeclareMathOperator{\independent}{{\bot\!\!\!\bot}} -\DeclareMathOperator*{\argmin}{{arg\,min}} -\DeclareMathOperator*{\argmax}{{arg\,max}} -\newcommand{\D}{\textnormal{D}} -\renewcommand{\d}{\textnormal{d}} -\renewcommand{\t}[1]{{#1^{\prime}}} -\newcommand{\pinv}[1]{{#1^{\dagger}}} % `Moore-Penrose pseudoinverse` -\newcommand{\todo}[1]{{\color{red}TODO: #1}} - -% \DeclareFontFamily{U}{mathx}{\hyphenchar\font45} -% \DeclareFontShape{U}{mathx}{m}{n}{ -% <5> <6> <7> <8> <9> <10> -% <10.95> <12> <14.4> <17.28> <20.74> <24.88> -% mathx10 -% }{} -% \DeclareSymbolFont{mathx}{U}{mathx}{m}{n} -% \DeclareMathSymbol{\bigtimes}{1}{mathx}{"91} - -\begin{document} - -\maketitle - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%% Introduction %%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{Notation} -We 
start with a brief summary of the used notation. - -\todo{write this} - -Let $\ten{A}$ be a multi-dimensional array of order (rank) $r$ with dimensions $p_1\times ... \times p_r$ and the matrices $\mat{B}_i$ of dimensions $q_i\times p_i$ for $i = 1, ..., r$, then -\begin{displaymath} - \ten{A} \ttm[1] \mat{B}_1 \ttm[2] \ldots \ttm[r] \mat{B}_r - = \ten{A}\times\{ \mat{B}_1, ..., \mat{B}_r \} - = \ten{A}\times_{i\in[r]} \mat{B}_i - = (\ten{A}\times_{i\in[r]\backslash j} \mat{B}_i)\ttm[j]\mat{B}_j -\end{displaymath} -As an alternative example consider -\begin{displaymath} - \ten{A}\times_2\mat{B}_2\times_3\mat{B}_3 = \ten{A}\times\{ \mat{I}, \mat{B}_2, \mat{B}_3 \} = \ten{A}\times_{i\in\{2, 3\}}\mat{B}_i -\end{displaymath} -Another example -\begin{displaymath} - \mat{B}\mat{A}\t{\mat{C}} = \mat{A}\times_1\mat{B}\times_2\mat{C} - = \mat{A}\times\{\mat{B}, \mat{C}\} -\end{displaymath} - -\begin{displaymath} - (\ten{A}\ttm[i]\mat{B})_{(i)} = \mat{B}\ten{A}_{(i)} -\end{displaymath} - -\todo{continue} - -\section{Tensor Normal Distribution} -Let $\ten{X}$ be a multi-dimensional array random variable of order (rank) $r$ with dimensions $p_1\times ... \times p_r$ written as -\begin{displaymath} - \ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r). -\end{displaymath} -Its density is given by -\begin{displaymath} - f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{p_{-i}}} \Big)^{-1} - \exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right) -\end{displaymath} -where $p_{\lnot i} = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution -\begin{displaymath} - \vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1) -\end{displaymath} -with $p = \prod_{i = 1}^r p_i$. 
- -\begin{theorem}[Tensor Normal to Multi-Variate Normal equivalence] - For a multi-dimensional random variable $\ten{X}$ of order $r$ with dimensions $p_1\times ..., p_r$. Let $\ten{\mu}$ be the mean of the same order and dimensions as $\ten{X}$ and the mode covariance matrices $\mat{\Delta}_i$ of dimensions $p_i\times p_i$ for $i = 1, ..., n$. Then the tensor normal distribution is equivalent to the multi-variate normal distribution by the relation - \begin{displaymath} - \ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r) - \qquad\Leftrightarrow\qquad - \vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes ...\otimes \mat{\Delta}_1) - \end{displaymath} - where $p = \prod_{i = 1}^r p_i$. -\end{theorem} -\begin{proof} - A straight forward way is to rewrite the Tensor Normal density as the density of a Multi-Variate Normal distribution depending on the vectorization of $\ten{X}$. First consider - \begin{align*} - \langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle - &= \t{\vec(\ten{X} - \mu)}\vec((\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\}) \\ - &= \t{\vec(\ten{X} - \mu)}(\mat{\Delta}_r^{-1}\otimes ...\otimes\mat{\Delta}_1^{-1})\vec(\ten{X} - \mu) \\ - &= \t{(\vec\ten{X} - \vec\mu)}(\mat{\Delta}_r\otimes ...\otimes\mat{\Delta}_1)^{-1}(\vec\ten{X} - \vec\mu). - \end{align*} - Next, using a property of the determinant of a Kronecker product $|\mat{\Delta}_1\otimes\mat{\Delta}_2| = |\mat{\Delta}_1|^{p_2}|\mat{\Delta}_2|^{p_1}$ yields - \begin{displaymath} - |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1| - = |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_2|^{p_1}|\mat{\Delta}_1|^{p_{\lnot 1}} - \end{displaymath} - where $p_{\lnot i} = \prod_{j \neq i}p_j$. By induction over $r$ the relation - \begin{displaymath} - |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1| - = \prod_{i = 1}^r |\mat{\Delta}_i|^{p_{\lnot i}} - \end{displaymath} - holds for arbitrary order $r$. 
Substituting into the Tensor Normal density leads to - \begin{align*} - f(\ten{X}) = \Big( (2\pi)^p |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1| \Big)^{-1/2} - \exp\!\left( -\frac{1}{2}\t{(\vec\ten{X} - \vec\mu)}(\mat{\Delta}_r\otimes ...\otimes\mat{\Delta}_1)^{-1}(\vec\ten{X} - \vec\mu) \right) - \end{align*} - which is the Multi-Variate Normal density of the $p$ dimensional vector $\vec\ten{X}$. -\end{proof} - -When sampling from the Multi-Array Normal one way is to sample from the Multi-Variate Normal and then reshaping the result, but this is usually very inefficient because it requires to store the multi-variate covariance matrix which is very big. Instead, it is more efficient to sample $\ten{Z}$ as a tensor of the same shape as $\ten{X}$ with standard normal entries and then transform the $\ten{Z}$ to follow the Multi-Array Normal as follows -\begin{displaymath} - \ten{Z}\sim\mathcal{TN}(0, \mat{I}_{p_1}, ..., \mat{I}_{p_r}) - \quad\Rightarrow\quad - \ten{X} = \ten{Z}\times\{\mat{\Delta}_1^{1/2}, ..., \mat{\Delta}_r^{1/2}\} + \mu\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r). -\end{displaymath} -where the sampling from the standard Multi-Array Normal is done by sampling all of the elements of $\ten{Z}$ from a standard Normal. - - -\section{Introduction} -We assume the model -\begin{displaymath} - \mat{X} = \mat{\mu} + \mat{\beta}\mat{f}_y \t{\mat{\alpha}} + \mat{\epsilon} -\end{displaymath} -where the dimensions of all the components are listed in Table~\ref{tab:dimensions}. 
-and its vectorized form -\begin{displaymath} - \vec\mat{X} = \vec\mat{\mu} + (\mat{\alpha}\kron\mat{\beta})\vec\mat{f}_y + \vec\mat{\epsilon} -\end{displaymath} - -\begin{table}[!htp] - \centering - \begin{minipage}{0.8\textwidth} - \centering - \begin{tabular}{l l} - $\mat X, \mat\mu, \mat R, \mat\epsilon$ & $p\times q$ \\ - $\mat{f}_y$ & $k\times r$ \\ - $\mat\alpha$ & $q\times r$ \\ - $\mat\beta$ & $p\times k$ \\ - $\mat\Delta$ & $p q\times p q$ \\ - $\mat\Delta_1$ & $q\times q$ \\ - $\mat\Delta_2$ & $p\times p$ \\ - $\mat{r}$ & $p q\times 1$ \\ - \hline - $\ten{X}, \ten{R}$ & $n\times p\times q$ \\ - $\ten{F}$ & $n\times k\times r$ \\ - \end{tabular} - \caption{\label{tab:dimensions}\small Summary listing of dimensions with the corresponding sample versions $\mat{X}_i, \mat{R}_i, \mat{r}_i, \mat{f}_{y_i}$ for $i = 1, ..., n$ as well as estimates $\widehat{\mat{\alpha}}, \widehat{\mat{\beta}}, \widehat{\mat\Delta}, \widehat{\mat\Delta}_1$ and $\widehat{\mat\Delta}_2$.} - \end{minipage} -\end{table} - -The log-likelihood $l$ given $n$ i.i.d. 
observations assuming that $\mat{X}_i\mid(Y = y_i)$ is normally distributed as -\begin{displaymath} - \vec\mat{X}_i \sim \mathcal{N}_{p q}(\vec\mat\mu + (\mat\alpha\kron\mat\beta)\vec\mat{f}_{y_i}, \Delta) -\end{displaymath} -Replacing all unknowns by their estimates gives the (estimated) log-likelihood -\begin{equation}\label{eq:log-likelihood-est} - \hat{l}(\mat\alpha, \mat\beta) = -\frac{n q p}{2}\log 2\pi - \frac{n}{2}\log|\widehat{\mat\Delta}| - \frac{1}{2}\sum_{i = 1}^n \t{\mat{r}_i}\widehat{\mat\Delta}^{-1}\mat{r}_i -\end{equation} -where the residuals are -\begin{displaymath} - \mat{r}_i = \vec\mat{X}_i - \vec\overline{\mat{X}} - (\mat\alpha\kron\mat\beta)\vec{\mat f}_{y_i}\qquad (p q \times 1) -\end{displaymath} -and the MLE estimate assuming $\mat\alpha, \mat\beta$ known for the covariance matrix $\widehat{\mat\Delta}$ as solution to the score equations is -\begin{equation}\label{eq:Delta} - \widehat{\mat\Delta} = \frac{1}{n}\sum_{i = 1}^n \mat{r}_i \t{\mat{r}_i} \qquad(p q \times p q). -\end{equation} -Note that the log-likelihood estimate $\hat{l}$ only depends on $\mat\alpha, \mat\beta$. Next, we compute the gradient for $\mat\alpha$ and $\mat\beta$ of $\hat{l}$ used to formulate a Gradient Descent based estimation algorithm for $\mat\alpha, \mat\beta$ as a replacement for the previous algorithm. The main reason is to enable an estimation for bigger dimensions of the $\mat\alpha, \mat\beta$ coefficients since the previous algorithm does \emph{not} solve the high run time problem for bigger dimensions. - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%% Derivative %%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{Derivative of the Log-Likelihood} -Start with the general case of $\mat X_i|(Y_i = y_i)$ being multivariate normally distributed with the covariance $\mat\Delta$ being a $p q\times p q$ positive definite symmetric matrix \emph{without} any further assumptions. 
We have $i = 1, ..., n$ observations following -\begin{displaymath} - \mat{r}_i = \vec(\mat X_i - \mat\mu - \mat\beta\mat{f}_{y_i}\t{\mat\alpha}) \sim \mathcal{N}_{p q}(\mat 0, \mat\Delta). -\end{displaymath} -The MLE estimates of $\mat\mu, \mat\Delta$ are -\begin{displaymath} - \widehat{\mat\mu} = \overline{\mat X} = \frac{1}{n}\sum_{i = 1}^n \mat X_i {\color{gray}\qquad(p\times q)}, - \qquad \widehat{\mat\Delta} = \frac{1}{n}\sum_{i = 1}^n \mat r_i\t{\mat r_i} {\color{gray}\qquad(p q\times p q)}. -\end{displaymath} -Substitution of the MLE estimates into the log-likelihood $l(\mat\mu, \mat\Delta, \mat\alpha, \mat\beta)$ gives the estimated log-likelihood $\hat{l}(\mat\alpha, \mat\beta)$ as -\begin{displaymath} - \hat{l}(\mat\alpha, \mat\beta) = -\frac{n q p}{2}\log 2\pi - \frac{n}{2}\log|\widehat{\mat\Delta}| - \frac{1}{2}\sum_{i = 1}^n \t{\mat{r}_i}\widehat{\mat\Delta}^{-1}\mat{r}_i. -\end{displaymath} -We are interested in the gradients $\nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta)$, $\nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta)$ of the estimated log-likelihood. Therefore, we consider the differential of $\hat{l}$. -\begin{align} - \d\hat{l}(\mat\alpha, \mat\beta) - &= -\frac{n}{2}\log|\widehat{\mat{\Delta}}| - \frac{1}{2}\sum_{i = 1}^n \big(\t{(\d \mat{r}_i)}\widehat{\mat{\Delta}}^{-1} \mat{r}_i + \t{\mat{r}_i}(\d\widehat{\mat{\Delta}}^{-1}) \mat{r}_i + \t{\mat{r}_i}\widehat{\mat{\Delta}}^{-1} \d \mat{r}_i\big) \nonumber\\ - &= \underbrace{-\frac{n}{2}\log|\widehat{\mat{\Delta}}| - \frac{1}{2}\sum_{i = 1}^n \t{\mat{r}_i}(\d\widehat{\mat{\Delta}}^{-1}) \mat{r}_i}_{=\,0\text{ due to }\widehat{\mat{\Delta}}\text{ beeing the MLE}} \label{eq:deriv1} - - \sum_{i = 1}^n \t{\mat{r}_i}\widehat{\mat{\Delta}}^{-1} \d \mat{r}_i. 
-\end{align} -The next step is to compute $\d \mat{r}_i$ which depends on both $\mat\alpha$ and $\mat\beta$ -\begin{align*} - \d\mat{r}_i(\mat\alpha, \mat\beta) - &= -\d(\mat\alpha\kron \mat\beta)\vec\mat{f}_{y_i} \\ - &= -\vec\!\big( \mat{I}_{p q}\,\d(\mat\alpha\kron \mat\beta)\vec\mat{f}_{y_i} \big) \\ - &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})\,\d\vec(\mat\alpha\kron \mat\beta) \\ -\intertext{using the identity \ref{eq:vecKron}, to obtain vectorized differentials, gives} - \dots - &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \,\d(\vec \mat\alpha\kron \vec \mat\beta) \\ - &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \big((\d\vec \mat\alpha)\kron \vec \mat\beta + \vec \mat\alpha\kron (\d\vec \mat\beta)\big) \\ - &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \big(\mat{I}_{r q}(\d\vec \mat\alpha)\kron (\vec \mat\beta)\mat{I}_1 + (\vec \mat\alpha)\mat{I}_1\kron \mat{I}_{k p}(\d\vec \mat\beta)\big) \\ - &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \big((\mat{I}_{r q}\kron\vec \mat\beta)\d\vec \mat\alpha + (\vec \mat\alpha\kron \mat{I}_{k p})\d\vec \mat\beta\big) -\end{align*} -Now, substitution of $\d\mat{r}_i$ into \eqref{eq:deriv1} gives the gradients (not dimension standardized versions of $\D\hat{l}(\mat\alpha)$, $\D\hat{l}(\mat\beta)$) by identification of the derivatives from the differentials (see: \todo{appendix}) -\begin{align*} - \nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta) &= - \sum_{i = 1}^n (\t{\vec(\mat{f}_{y_i})}\kron\t{\mat{r}_i}\widehat{\mat\Delta}^{-1}) (\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) (\mat{I}_{r q}\kron\vec \mat\beta), - {\color{gray}\qquad(q\times r)} \\ - \nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta) &= - \sum_{i = 1}^n (\t{\vec(\mat{f}_{y_i})}\kron\t{\mat{r}_i}\widehat{\mat\Delta}^{-1}) (\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) (\vec 
\mat\alpha\kron \mat{I}_{k p}). - {\color{gray}\qquad(p\times k)} -\end{align*} -These quantities are very verbose as well as completely unusable for an implementation. By detailed analysis of the gradients we see that the main parts are only element permutations with a high sparsity. By defining the following compact matrix -\begin{equation}\label{eq:permTransResponse} - \mat G = \vec^{-1}_{q r}\bigg(\Big( \sum_{i = 1}^n \vec\mat{f}_{y_i}\otimes \widehat{\mat\Delta}^{-1}\mat{r}_i \Big)_{\pi(i)}\bigg)_{i = 1}^{p q k r}{\color{gray}\qquad(q r \times p k)} -\end{equation} -with $\pi$ being a permutation of $p q k r$ elements corresponding to permuting the axis of a 4D tensor of dimensions $p\times q\times k\times r$ by $(2, 4, 1, 3)$. As a generalization of transposition this leads to a rearrangement of the elements corresponding to the permuted 4D tensor with dimensions $q\times r\times p\times k$ which is then vectorized and reshaped into a matrix of dimensions $q r \times p k$. With $\mat G$ the gradients simplify to \todo{validate this mathematically} -\begin{align*} - \nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta) &= - \vec_{q}^{-1}(\mat{G}\vec{\mat\beta}), - {\color{gray}\qquad(q\times r)} \\ - \nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta) &= - \vec_{p}^{-1}(\t{\mat{G}}\vec{\mat\alpha}). - {\color{gray}\qquad(p\times k)} -\end{align*} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%% Kronecker Covariance Structure %%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{Kronecker Covariance Structure} -Now we assume the residuals covariance has the form $\mat\Delta = \mat\Delta_1\otimes\mat\Delta_2$ where $\mat\Delta_1$, $\mat\Delta_2$ are $q\times q$, $p\times p$ covariance matrices, respectively. This is analog to the case that $\mat{R}_i$'s are i.i.d. 
Matrix Normal distributed -\begin{displaymath} - \mat{R}_i = \mat{X}_i - \mat\mu - \mat\beta\mat{f}_{y_i}\t{\mat\alpha} \sim \mathcal{MN}_{p\times q}(\mat 0, \mat\Delta_2, \mat\Delta_1). -\end{displaymath} -The density of the Matrix Normal (with mean zero) is equivalent to the vectorized quantities being multivariate normally distributed with Kronecker structured covariance -\begin{align*} - f(\mat R) - &= \frac{1}{\sqrt{(2\pi)^{p q}|\mat\Delta|}}\exp\left(-\frac{1}{2}\t{\vec(\mat{R})} \mat\Delta^{-1}\vec(\mat{R})\right) \\ - &= \frac{1}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{1}{2}\tr(\mat\Delta_1^{-1}\t{\mat{R}}\mat\Delta_2^{-1}\mat{R})\right) -\end{align*} -which for given data leads to the log-likelihood -\begin{displaymath} - l(\mat{\mu}, \mat\Delta_1, \mat\Delta_2) = - -\frac{n p q}{2}\log 2\pi - -\frac{n p}{2}\log|\mat{\Delta}_1| - -\frac{n q}{2}\log|\mat{\Delta}_2| - -\frac{1}{2}\sum_{i = 1}^n \tr(\mat\Delta_1^{-1}\t{\mat{R}_i}\mat\Delta_2^{-1}\mat{R}_i). -\end{displaymath} -\subsection{MLE covariance estimates} -Our first order of business is to derive the MLE estimates of the covariance matrices $\mat\Delta_1$, $\mat\Delta_2$ (the mean estimate $\widehat{\mat\mu}$ is trivial). 
Therefore, we look at the differentials with respect to changes in the covariance matrices as -\begin{align*} - \d l(\mat\Delta_1, \mat\Delta_2) &= - -\frac{n p}{2}\d\log|\mat{\Delta}_1| - -\frac{n q}{2}\d\log|\mat{\Delta}_2| - -\frac{1}{2}\sum_{i = 1}^n - \tr( (\d\mat\Delta_1^{-1})\t{\mat{R}_i}\mat\Delta_2^{-1}\mat{R}_i - + \mat\Delta_1^{-1}\t{\mat{R}_i}(\d\mat\Delta_2^{-1})\mat{R}_i) \\ - &= - -\frac{n p}{2}\tr\mat{\Delta}_1^{-1}\d\mat{\Delta}_1 - -\frac{n q}{2}\tr\mat{\Delta}_2^{-1}\d\mat{\Delta}_2 \\ - &\qquad\qquad - +\frac{1}{2}\sum_{i = 1}^n - \tr( \mat\Delta_1^{-1}(\d\mat\Delta_1)\mat\Delta_1^{-1}\t{\mat{R}_i}\mat\Delta_2^{-1}\mat{R}_i - + \mat\Delta_1^{-1}\t{\mat{R}_i}\mat\Delta_2^{-1}(\d\mat\Delta_2)\mat\Delta_2^{-1}\mat{R}_i) \\ - &= \frac{1}{2}\tr\!\Big(\Big( - -n p \mat{I}_q + \mat\Delta_1^{-1}\sum_{i = 1}^n \t{\mat{R}_i}\mat\Delta_2^{-1}\mat{R}_i - \Big)\mat{\Delta}_1^{-1}\d\mat{\Delta}_1\Big) \\ - &\qquad\qquad - + \frac{1}{2}\tr\!\Big(\Big( - -n q \mat{I}_p + \mat\Delta_2^{-1}\sum_{i = 1}^n \mat{R}_i\mat\Delta_1^{-1}\t{\mat{R}_i} - \Big)\mat{\Delta}_2^{-1}\d\mat{\Delta}_2\Big) \overset{!}{=} 0. -\end{align*} -Setting $\d l$ to zero yields the MLE estimates as -\begin{displaymath} - \widehat{\mat{\mu}} = \overline{\mat X}{\color{gray}\quad(p\times q)}, \qquad - \widehat{\mat\Delta}_1 = \frac{1}{n p}\sum_{i = 1}^n \t{\mat{R}_i}\widehat{\mat\Delta}_2^{-1}\mat{R}_i{\color{gray}\quad(q\times q)}, \qquad - \widehat{\mat\Delta}_2 = \frac{1}{n q}\sum_{i = 1}^n \mat{R}_i\widehat{\mat\Delta}_1^{-1}\t{\mat{R}_i}{\color{gray}\quad(p\times p)}. -\end{displaymath} -Next, analog to above, we take the estimated log-likelihood and derive gradients with respect to $\mat{\alpha}$, $\mat{\beta}$. 
-The estimated log-likelihood is derived by replacing the unknown covariance matrices by their MLE estimates, leading to -\begin{displaymath} - \hat{l}(\mat\alpha, \mat\beta) = - -\frac{n p q}{2}\log 2\pi - -\frac{n p}{2}\log|\widehat{\mat{\Delta}}_1| - -\frac{n q}{2}\log|\widehat{\mat{\Delta}}_2| - -\frac{1}{2}\sum_{i = 1}^n \tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat{R}_i) -\end{displaymath} -and its differential -\begin{displaymath} - \d\hat{l}(\mat\alpha, \mat\beta) = - -\frac{n p}{2}\d\log|\widehat{\mat{\Delta}}_1| - -\frac{n q}{2}\d\log|\widehat{\mat{\Delta}}_2| - -\frac{1}{2}\sum_{i = 1}^n \d\tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat{R}_i). -\end{displaymath} -We first take a closer look at the sum. After a bit of algebra using $\d\mat A^{-1} = -\mat A^{-1}(\d\mat A)\mat A^{-1}$ and the definitions of $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$ the sum can be rewritten -\begin{displaymath} - \frac{1}{2}\sum_{i = 1}^n \d\tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat{R}_i) - = \sum_{i = 1}^n \tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\d\mat{R}_i) - - \frac{np}{2}\d\log|\widehat{\mat\Delta}_1| - - \frac{nq}{2}\d\log|\widehat{\mat\Delta}_2|. 
-\end{displaymath} -This means that most of the derivative cancels out and we get -\begin{align*} - \d\hat{l}(\mat\alpha, \mat\beta) - &= \sum_{i = 1}^n \tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\d\mat{R}_i) \\ - &= \sum_{i = 1}^n \tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}((\d\mat\beta)\mat{f}_{y_i}\t{\mat\alpha} + \mat\beta\mat{f}_{y_i}\t{(\d\mat\alpha}))) \\ - &= \sum_{i = 1}^n \t{\vec(\widehat{\mat{\Delta}}_2^{-1}\mat{R}_i\widehat{\mat{\Delta}}_1^{-1}\mat\alpha\t{\mat{f}_{y_i}})}\d\vec\mat\beta - + \sum_{i = 1}^n \t{\vec(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat\beta\mat{f}_{y_i})}\d\vec\mat\alpha -\end{align*} -which means the gradients are -\begin{align*} - \nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta) - &= \sum_{i = 1}^n \widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat\beta\mat{f}_{y_i} - = (\ten{R}\ttm[3]\widehat{\mat{\Delta}}_1^{-1}\ttm[2]\widehat{\mat{\Delta}}_2^{-1})_{(3)}\t{(\ten{F}\ttm[2]\mat\beta)_{(3)}}\\ - \nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta) - &= \sum_{i = 1}^n \widehat{\mat{\Delta}}_2^{-1}\mat{R}_i\widehat{\mat{\Delta}}_1^{-1}\mat\alpha\t{\mat{f}_{y_i}} - = (\ten{R}\ttm[3]\widehat{\mat{\Delta}}_1^{-1}\ttm[2]\widehat{\mat{\Delta}}_2^{-1})_{(2)}\t{(\ten{F}\ttm[3]\mat\alpha)_{(2)}} -\end{align*} - -\paragraph{Comparison to the general case:} There are two main differences, first the general case has a closed form solution for the gradient due to the explicit nature of the MLE estimate of $\widehat{\mat\Delta}$ compared to the mutually dependent MLE estimates $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$. On the other hand the general case has dramatically bigger dimensions of the covariance matrix ($p q \times p q$) compared to the two Kronecker components with dimensions $q \times q$ and $p \times p$. 
This means that in the general case there is a huge performance penalty in the dimensions of $\widehat{\mat\Delta}$ while in the other case an extra estimation is required to determine $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$. - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%% Alternative covariance estimates                                         %%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Alternative covariance estimates} -An alternative approach is \emph{not} to use the MLE estimates for $\mat\Delta_1$, $\mat\Delta_2$ but (up to scaling) unbiased estimates. -\begin{displaymath} - \widetilde{\mat\Delta}_1 = \frac{1}{n}\sum_{i = 1}^n \t{\mat{R}_i}\mat{R}_i {\color{gray}\quad(q\times q)},\qquad - \widetilde{\mat\Delta}_2 = \frac{1}{n}\sum_{i = 1}^n \mat{R}_i\t{\mat{R}_i} {\color{gray}\quad(p\times p)}. -\end{displaymath} -The unbiasedness comes directly from the following short computation: -\begin{displaymath} - (\E\widetilde{\mat\Delta}_1)_{j,k} = \frac{1}{n}\sum_{i = 1}^n \sum_{l = 1}^p \E \mat{R}_{i,l,j}\mat{R}_{i,l,k} - = \frac{1}{n}\sum_{i = 1}^n \sum_{l = 1}^p (\mat{\Delta}_{2})_{l,l}(\mat{\Delta}_{1})_{j,k} - = (\mat\Delta_1\tr(\mat\Delta_2))_{j,k}, -\end{displaymath} -which means that $\E\widetilde{\mat\Delta}_1 = \mat\Delta_1\tr(\mat\Delta_2)$ and in analogy $\E\widetilde{\mat\Delta}_2 = \mat\Delta_2\tr(\mat\Delta_1)$. Now, we need to handle the scaling which can be estimated unbiasedly by -\begin{displaymath} - \tilde{s} = \frac{1}{n}\sum_{i = 1}^n \|\mat{R}_i\|_F^2 -\end{displaymath} -because with $\|\mat{R}_i\|_F^2 = \tr \mat{R}_i\t{\mat{R}_i} = \tr \t{\mat{R}_i}\mat{R}_i$ the scale estimate $\tilde{s} = \tr(\widetilde{\mat\Delta}_1) = \tr(\widetilde{\mat\Delta}_2)$. Then $\E\tilde{s} = \tr(\E\widetilde{\mat\Delta}_1) = \tr{\mat\Delta}_1 \tr{\mat\Delta}_2 = \tr({\mat\Delta}_1\otimes{\mat\Delta}_2)$. 
This leads to the estimate of the covariance as -\begin{displaymath} - \widetilde{\mat\Delta} = \tilde{s}^{-1}(\widetilde{\mat{\Delta}}_1\otimes\widetilde{\mat{\Delta}}_2) -\end{displaymath} - -\todo{ prove they are consistent, especially $\widetilde{\mat\Delta} = \tilde{s}^{-1}(\widetilde{\mat\Delta}_1\otimes\widetilde{\mat\Delta}_2)$!} - -The hoped-for benefit is that these covariance estimates are in a closed form which means there is no need for an additional iterative estimation step. Before we start with the derivation of the gradients, we define the following two quantities -\begin{align*} - \mat{S}_1 = \frac{1}{n}\sum_{i = 1}^n \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i = \frac{1}{n}\ten{R}_{(3)}\t{(\ten{R}\ttm[2]\widetilde{\mat{\Delta}}_2^{-1})_{(3)}}\quad{\color{gray}(q\times q)}, \\ - \mat{S}_2 = \frac{1}{n}\sum_{i = 1}^n \mat{R}_i\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} = \frac{1}{n}\ten{R}_{(2)}\t{(\ten{R}\ttm[3]\widetilde{\mat{\Delta}}_1^{-1})_{(2)}}\quad{\color{gray}(p\times p)}. 
-\end{align*} -\todo{Check tensor form!} - -Now, the matrix normal with the covariance matrix of the vectorized quantities of the form $\mat{\Delta} = s^{-1}(\mat{\Delta}_1\otimes\mat{\Delta}_2)$ has the form -\begin{align*} - f(\mat R) - &= \frac{1}{\sqrt{(2\pi)^{p q}|\mat\Delta|}}\exp\left(-\frac{1}{2}\t{\vec(\mat{R})} \mat\Delta^{-1}\vec(\mat{R})\right) \\ - &= \frac{s^{p q / 2}}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{s}{2}\tr(\mat\Delta_1^{-1}\t{\mat{R}}\mat\Delta_2^{-1}\mat{R})\right) -\end{align*} - -The approximated log-likelihood is then -\begin{align*} - \tilde{l}(\mat\alpha, \mat\beta) - &= - -\frac{n p q}{2}\log{2\pi} - -\frac{n}{2}\log|\widetilde{\mat{\Delta}}| - -\frac{1}{2}\sum_{i = 1}^n \t{\mat{r}_i}\widetilde{\mat{\Delta}}^{-1}\mat{r}_i \\ - &= - -\frac{n p q}{2}\log{2\pi} - +\frac{n p q}{2}\log\tilde{s} - -\frac{n p}{2}\log|\widetilde{\mat{\Delta}}_1| - -\frac{n q}{2}\log|\widetilde{\mat{\Delta}}_2| - -\frac{\tilde{s}}{2}\sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i). -\end{align*} -The second form is due to the property of the determinant for scaling and the Kronecker product giving that $|\widetilde{\mat\Delta}| = (\tilde{s}^{-1})^{p q}|\widetilde{\mat{\Delta}}_1|^p |\widetilde{\mat{\Delta}}_2|^q$ as well as an analog Kronecker decomposition as in the MLE case. - -Note that with the following holds -\begin{displaymath} - \sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i) - = n \tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1) - = n \tr(\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2) - = n \tr(\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}) - = n \tr(\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1}). -\end{displaymath} - -The derivation of the Gradient of the approximated log-likelihood $\tilde{l}$ is tedious but straight forward. 
We tackle the summands separately; -\begin{align*} - \d\log\tilde{s} &= \tilde{s}^{-1}\d\tilde{s} = \frac{2}{n\tilde{s}}\sum_{i = 1}^n \tr(\t{\mat{R}_i}\d\mat{R}_i) - = -\frac{2}{n\tilde{s}}\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\d\mat{\beta}), \\ - \d\log|\widetilde{\mat{\Delta}}_1| &=\tr(\widetilde{\mat{\Delta}}_1^{-1}\d\widetilde{\mat{\Delta}}_1) = \frac{2}{n}\sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\d\mat{R}_i) - = -\frac{2}{n}\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\widetilde{\mat{\Delta}}_1^{-1}\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\d\mat{\beta}), \\ - \d\log|\widetilde{\mat{\Delta}}_2| &=\tr(\widetilde{\mat{\Delta}}_2^{-1}\d\widetilde{\mat{\Delta}}_2) = \frac{2}{n}\sum_{i = 1}^n \tr(\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\d\mat{R}_i) - = -\frac{2}{n}\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\d\mat{\beta}) -\end{align*} -as well as -\begin{displaymath} - \d\,\tilde{s}\sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i) - = (\d\tilde{s})\sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i) - + \tilde{s}\, \d \sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i). 
-\end{displaymath} -We have -\begin{displaymath} - \d\tilde{s} = -\frac{2}{n}\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\d\mat{\beta}) -\end{displaymath} -and the remaining term -\begin{align*} - \d\sum_{i = 1}^n\tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i) - = 2\sum_{i = 1}^n \tr(&\t{\mat{f}_{y_i}}\t{\mat{\beta }}(\mat{R}_i \widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1} + \widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i - \widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i \widetilde{\mat{\Delta}}_1^{-1})\d\mat{\alpha} \\ - +\,&\mat{f}_{y_i} \t{\mat{\alpha}}(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} + \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} - \widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1})\d\mat{\beta }). -\end{align*} -The last one is tedious but straight forward. Its computation extensively uses the symmetry of $\widetilde{\mat{\Delta}}_1$, $\widetilde{\mat{\Delta}}_2$, the cyclic property of the trace and the relation $\d\mat{A}^{-1} = -\mat{A}^{-1}(\d\mat{A})\mat{A}^{-1}$. 
- -Putting it all together -\begin{align*} - \d\tilde{l}(\mat{\alpha}, \mat{\beta}) - &= \frac{n p q}{2}\Big(-\frac{2}{n\tilde{s}}\Big)\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\d\mat{\beta}) \\ - &\hspace{3em} - \frac{n p}{2}\Big(-\frac{2}{n}\Big)\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\widetilde{\mat{\Delta}}_1^{-1}\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\d\mat{\beta}) \\ - &\hspace{3em} - \frac{n q}{2}\Big(-\frac{2}{n}\Big)\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\d\mat{\beta}) \\ - &\hspace{3em} -\frac{1}{2}\Big(-\frac{2}{n}\Big)\Big(\sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i)\Big)\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\d\mat{\beta}) \\ - &\hspace{3em} -\frac{\tilde{s}}{2}2\sum_{i = 1}^n \tr\!\Big(\t{\mat{f}_{y_i}}\t{\mat{\beta }}(\mat{R}_i \widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1} + \widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i - \widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i \widetilde{\mat{\Delta}}_1^{-1})\d\mat{\alpha} \\ - &\hspace{3em} \hspace{4.7em} + \mat{f}_{y_i} \t{\mat{\alpha}}(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} + \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} - \widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1})\d\mat{\beta }\Big) \\ -% - &= \sum_{i = 1}^n \tr\bigg(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\Big( - -p q \tilde{s}^{-1} \mat{R}_i + p \mat{R}_i\widetilde{\mat{\Delta}}_1^{-1} + q \widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i + 
\tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1)\mat{R}_i \\ - &\hspace{3em} \hspace{4.7em} - \tilde{s}(\mat{R}_i \widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1} + \widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i - \widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i \widetilde{\mat{\Delta}}_1^{-1}) - \Big)\d\mat{\alpha}\bigg) \\ - &\hspace{3em}+ \sum_{i = 1}^n \tr\bigg(\mat{f}_{y_i}\t{\mat{\alpha}}\Big( - -p q \tilde{s}^{-1} \t{\mat{R}_i} + p \widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} + q \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1} + \tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1)\t{\mat{R}_i} \\ - &\hspace{3em}\hspace{3em} \hspace{4.7em} - \tilde{s}(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} + \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} - \widetilde{\mat{\Delta}}_1^{-1} \t{\mat{R}_i} \widetilde{\mat{\Delta}}_2^{-1}) - \Big)\d\mat{\beta}\bigg). -\end{align*} -Observe that the bracketed expressions before $\d\mat{\alpha}$ and $\d\mat{\beta}$ are transposes. 
Let's denote the expression for $\d\mat{\alpha}$ as $\mat{G}_i$ which has the form -\begin{displaymath} - \mat{G}_i - = (\tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1) - p q \tilde{s}^{-1})\mat{R}_i - + (q\mat{I}_p - \tilde{s}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2)\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i - + \mat{R}_i\widetilde{\mat{\Delta}}_1^{-1}(p\mat{I}_q - \tilde{s}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}) - + \tilde{s}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i\widetilde{\mat{\Delta}}_1^{-1} -\end{displaymath} -and with $\ten{G}$ the order 3 tensor stacking the $\mat{G}_i$'s such that the first mode indexes the observations -\begin{displaymath} - \ten{G} - = (\tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1) - p q \tilde{s}^{-1})\ten{R} - + \ten{R}\ttm[2](q\mat{I}_p - \tilde{s}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2)\widetilde{\mat{\Delta}}_2^{-1} - + \ten{R}\ttm[3](p\mat{I}_q - \tilde{s}\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1)\widetilde{\mat{\Delta}}_1^{-1} - + \tilde{s}\ten{R}\ttm[2]\widetilde{\mat{\Delta}}_2^{-1}\ttm[3]\widetilde{\mat{\Delta}}_1^{-1}. -\end{displaymath} -This leads to the following form of the differential of $\tilde{l}$ given by -\begin{displaymath} - \d\tilde{l}(\mat{\alpha}, \mat{\beta}) - = \sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{G}_i\d\mat{\alpha}) - + \sum_{i = 1}^n \tr(\mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{G}_i}\d\mat{\beta}) -\end{displaymath} -and therefore the gradients -\begin{align*} - \nabla_{\mat{\alpha}}\tilde{l}(\mat{\alpha}, \mat{\beta}) &= \sum_{i = 1}^n \t{\mat{G}_i}\mat{\beta}\mat{f}_{y_i} - = \ten{G}_{(3)}\t{(\ten{F}\ttm[2]\mat{\beta})_{(3)}}, \\ - \nabla_{\mat{\beta}} \tilde{l}(\mat{\alpha}, \mat{\beta}) &= \sum_{i = 1}^n \mat{G}_i\mat{\alpha}\t{\mat{f}_{y_i}} - = \ten{G}_{(2)}\t{(\ten{F}\ttm[3]\mat{\alpha})_{(2)}}. 
-\end{align*} - -\todo{check the tensor version of the gradient!!!} - -\newpage - -\section{Thoughts on initial value estimation} -\todo{This section uses an alternative notation as it already tries to generalize to general multi-dimensional arrays. Furthermore, one of the main differences is that the observation are indexed in the \emph{last} mode. The benefit of this is that the mode product and parameter matrix indices match not only in the population model but also in sample versions.} -Let $\ten{X}, \ten{F}$ be order (rank) $r$ tensors of dimensions $p_1\times ... \times p_r$ and $q_1\times ... \times q_r$, respectively. Also denote the error tensor $\epsilon$ of the same order and dimensions as $\ten{X}$. The considered model for the $i$'th observation is -\begin{displaymath} - \ten{X}_i = \ten{\mu} + \ten{F}_i\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r \} + \ten{\epsilon}_i -\end{displaymath} -where we assume $\ten{\epsilon}_i$ to be i.i.d. mean zero tensor normal distributed $\ten{\epsilon}\sim\mathcal{TN}(0, \mat{\Delta}_1, ..., \mat{\Delta}_r)$ for $\mat{\Delta}_j\in\mathcal{S}^{p_j}_{++}$, $j = 1, ..., r$. Given $i = 1, ..., n$ observations the collected model containing all observations -\begin{displaymath} - \ten{X} = \ten{\mu} + \ten{F}\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{I}_n \} + \ten{\epsilon} -\end{displaymath} -which is almost identical as the observations $\ten{X}_i, \ten{F}_i$ are stacked on an addition $r + 1$ mode leading to response, predictor and error tensors $\ten{X}, \ten{F}$ of order (rank) $r + 1$ and dimensions $p_1\times...\times p_r\times n$ for $\ten{X}, \ten{\epsilon}$ and $q_1\times...\times q_r\times n$ for $\ten{F}$. - -In the following we assume w.l.o.g that $\ten{\mu} = 0$, as if this is not true we simply replace $\ten{X}_i$ with $\ten{X}_i - \ten{\mu}$ for $i = 1, ..., n$ before collecting all the observations in the response tensor $\ten{X}$. 
- -The goal here is to find reasonable estimates for $\mat{\alpha}_j$, $j = 1, ..., r$ for the mean model -\begin{displaymath} - \E \ten{X}|\ten{F}, \mat{\alpha}_1, ..., \mat{\alpha}_r = \ten{F}\times\{\mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{I}_n\} - = \ten{F}\times_{j\in[r]}\mat{\alpha}_j. -\end{displaymath} -Under the mean model, using the general mode product relation $(\ten{A}\times_j\mat{B})_{(j)} = \mat{B}\ten{A}_{(j)}$, we get -\begin{align*} - \ten{X}_{(j)}\t{\ten{X}_{(j)}} \overset{\text{SVD}}{=} \mat{U}_j\mat{D}_j\t{\mat{U}_j} - = \mat{\alpha}_j(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)} - \t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}\t{\mat{\alpha}_j} -\end{align*} -for the $j = 1, ..., r$ modes. Using this relation we construct an iterative estimation process by setting the initial estimates of $\hat{\mat{\alpha}}_j^{(0)} = \mat{U}_j[, 1:q_j]$ which are the first $q_j$ columns of $\mat{U}_j$. - -For getting least squares estimates for $\mat{\alpha}_j$, $j = 1, ..., r$ we observe that by matricization of the mean model -\begin{displaymath} - \ten{X}_{(j)} = (\ten{F}\times_{k\in[r]}\mat{\alpha}_k)_{(j)} = \mat{\alpha}_j(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)} -\end{displaymath} -leads to normal equations for each $\mat{\alpha}_j$, $j = 1, ..., r$ -\begin{displaymath} - \ten{X}_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}} = \mat{\alpha}_j(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}} -\end{displaymath} -where the normal equations for $\mat{\alpha}_j$ depend on all the other $\mat{\alpha}_k$. With the initial estimates from above this allows an alternating approach. 
Index with $t = 1, ...$ the current iteration, then a new estimate $\widehat{\mat{\alpha}}_j^{(t)}$ given the previous estimates $\widehat{\mat{\alpha}}_k^{(t-1)}$, $k = 1, ..., r$ is computed as -\begin{displaymath} - \widehat{\mat{\alpha}}_j^{(t)} = - \ten{X}_{(j)} - \t{\big(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k^{(t-1)}\big)_{(j)}} - \left( - \big(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k^{(t-1)}\big)_{(j)} - \t{\big(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k^{(t-1)}\big)_{(j)}} - \right)^{-1} -\end{displaymath} -for $j = 1, ..., r$ until convergence or a maximum number of iterations is exceeded. The final estimates are the least squares estimates by this procedure. - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%% Numerical Examples %%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{Numerical Examples} -% The first example (which by it self is \emph{not} exemplary) is the estimation with parameters $n = 200$, $p = 11$, $q = 5$, $k = 14$ and $r = 9$. The ``true'' matrices $\mat\alpha$, $\mat\beta$ generated by sampling there elements i.i.d. standard normal like the responses $y$. Then, for each observation, $\mat{f}_y$ is computed as $\sin(s_{i, j} y + o_{i j})$ \todo{ properly describe} to fill the elements of $\mat{f}_y$. Then the $\mat{X}$'s are samples as -% \begin{displaymath} -% \mat{X} = \mat{\beta}\mat{f}_y \t{\mat{\alpha}} + \mat{\epsilon}, \qquad \vec{\mat{\epsilon}} \sim \mathbb{N}_{p q}(\mat{0}, \mat{\Delta}) -% \end{displaymath} -% where $\mat{\Delta}_{i j} = 0.5^{|i - j|}$ for $i, j = 1, ..., p q$. 
- -\begin{table}[!ht] - \centering - % see: https://en.wikibooks.org/wiki/LaTeX/Tables - \begin{tabular}{ll|r@{ }l *{3}{r@{.}l}} - method & init - & \multicolumn{2}{c}{loss} - & \multicolumn{2}{c}{MSE} - & \multicolumn{2}{c}{$\dist(\hat{\mat\alpha}, \mat\alpha)$} - & \multicolumn{2}{c}{$\dist(\hat{\mat\beta}, \mat\beta)$} - \\ \hline - base & vlp & -2642&(1594) & 1&82 (2.714) & 0&248 (0.447) & 0&271 (0.458) \\ - new & vlp & -2704&(1452) & 1&78 (2.658) & 0&233 (0.438) & 0&260 (0.448) \\ - new & ls & -3479& (95) & 0&99 (0.025) & 0&037 (0.017) & 0&035 (0.015) \\ - momentum & vlp & -2704&(1452) & 1&78 (2.658) & 0&233 (0.438) & 0&260 (0.448) \\ - momentum & ls & -3479& (95) & 0&99 (0.025) & 0&037 (0.017) & 0&035 (0.015) \\ - approx & vlp & 6819&(1995) & 3&99 (12.256) & 0&267 (0.448) & 0&287 (0.457) \\ - approx & ls & 5457& (163) & 0&99 (0.025) & 0&033 (0.017) & 0&030 (0.012) \\ - \end{tabular} - \caption{Mean (standard deviation) for simulated runs of $20$ repetitions for the model $\mat{X} = \mat{\beta}\mat{f}_y\t{\mat{\alpha}}$ of dimensions $(p, q) = (11, 7)$, $(k, r) = (3, 5)$ with a sample size of $n = 200$. The covariance structure is $\mat{\Delta} = \mat{\Delta}_2\otimes \mat{\Delta}_1$ for $\mat{\Delta}_i = \text{AR}(\sqrt{0.5})$, $i = 1, 2$. 
The functions applied to the standard normal response $y$ are $\sin, \cos$ with increasing frequency.} -\end{table} - -% \begin{figure} -% \centering -% \includegraphics{loss_Ex01.png} -% \end{figure} -% \begin{figure} -% \centering -% \includegraphics{estimates_Ex01.png} -% \end{figure} -% \begin{figure} -% \centering -% \includegraphics{Delta_Ex01.png} -% \end{figure} -% \begin{figure} -% \centering -% \includegraphics{hist_Ex01.png} -% \end{figure} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%% Bib and Index                                                            %%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\printindex -\nocite{*} -\printbibliography - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%% Appendix                                                                 %%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\appendix -\section{Matrix Differential Rules} -Let $\mat A$ be a square matrix (and invertible if needed) and let $|.|$ stand for the determinant -\begin{align*} - \d\log|\mat A| &= \frac{1}{|\mat A|}\d|\mat{A}| \\ - \d|\mat A| &= |\mat A|\tr \mat{A}^{-1}\d\mat A \\ - \d\log|\mat A| &= \tr\mat{A}^{-1}\d\mat A \\ - \d\mat{X}^{-1} &= -\mat{X}^{-1}(\d\mat{X})\mat{X}^{-1} -\end{align*} - -\section{Useful Matrix Identities} -In this section we summarize a few useful matrix identities, for more details see for example \cite{MatrixAlgebra-AbadirMagnus2005}. - -For two matrices $\mat A$ of dimensions $q\times r$ and $\mat B$ of dimensions $p\times k$ it holds that -\begin{equation}\label{eq:vecKron} - \vec(\mat A\kron\mat B) = (\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p)(\vec\mat A\kron\vec\mat B). -\end{equation} - -Let $\mat A$ be a $p\times p$ dimensional non-singular matrix. 
Furthermore, let $\mat a, \mat b$ be $p$ vectors such that $\t{\mat b}A^{-1}\mat a\neq -1$, then -\begin{displaymath} - (\mat A + \mat a\t{\mat b})^{-1} = \mat{A}^{-1} - \frac{1}{1 + \t{\mat b}A^{-1}\mat a}\mat{A}^{-1}\mat{a}\t{\mat{b}}\mat{A}^{-1} -\end{displaymath} -as well as -\begin{displaymath} - \det(\mat A + \mat a\t{\mat b}) = \det(\mat A)(1 + \t{\mat b}{\mat A}^{-1}\mat a) -\end{displaymath} -which even holds in the case $\t{\mat b}A^{-1}\mat a = -1$. This is known as Sylvester's determinant theorem. - - -\section{Commutation Matrix and Permutation Identities} -\begin{center} - Note: In this section we use 0-indexing for the sake of simplicity! -\end{center} -In this section we summarize relations between the commutation matrix and corresponding permutation. We also list some extensions to ``simplify'' or represent some term. This is mostly intended for implementation purposes and understanding of terms occurring in the computations. - -Let $\mat A$ be an arbitrary $p\times q$ matrix. The permutation matrix $\mat K_{p, q}$ satisfies -\begin{displaymath} - \mat{K}_{p, q}\vec{\mat{A}} = \vec{\t{\mat{A}}} \quad\Leftrightarrow\quad (\vec{\mat{A}})_{\pi_{p, q}(i)} = (\vec{\t{\mat{A}}})_{i}, \quad\text{for } i = 0, ..., p q - 1 -\end{displaymath} -where $\pi_{p, q}$ is a permutation of the indices $i = 0, ..., p q - 1$ such that -\begin{displaymath} - \pi_{p, q}(i + j p) = j + i q, \quad\text{for }i = 0, ..., p - 1; j = 0, ..., q - 1. -\end{displaymath} - -\begin{table}[!htp] - \centering - \begin{minipage}{0.8\textwidth} - \centering - \begin{tabular}{l c l} - $\mat{K}_{p, q}$ & $\hat{=}$ & $\pi_{p, q}(i + j p) = j + i q$ \\ - $\mat{I}_r\kron\mat{K}_{p, q}$ & $\hat{=}$ & $\tilde{\pi}_{p, q, r}(i + j p + k p q) = j + i q + k p q$ \\ - $\mat{K}_{p, q}\kron\mat{I}_r$ & $\hat{=}$ & $\hat{\pi}_{p, q, r}(i + j p + k p q) = r(j + i q) + k$ - \end{tabular} - \caption{\label{tab:commutation-permutation}Commutation matrix terms and corresponding permutations. 
Indices are all 0-indexed with the ranges: $i = 0, ..., p - 1$, $j = 0, ..., q - 1$ and $k = 0, ..., r - 1$.} - \end{minipage} -\end{table} - - - -\section{Matrix and Tensor Operations} - -The \emph{Kronecker product}\index{Operations!Kronecker@$\kron$ Kronecker product} is denoted as $\kron$ and the \emph{Hadamard product} uses the symbol $\circ$. We also need the \emph{Khatri-Rao product}\index{Operations!KhatriRao@$\hada$ Khatri-Rao product} -$\hada$ as well as the \emph{Transposed Khatri-Rao product} $\odot_t$ (or \emph{Face-Splitting product}). There is also the \emph{$n$-mode Tensor Matrix Product}\index{Operations!ttm@$\ttm[n]$ $n$-mode tensor product} denoted by $\ttm[n]$ in conjunction with the \emph{$n$-mode Matricization} of a Tensor $\mat{T}$ written as $\mat{T}_{(n)}$, which is a matrix. See below for definitions and examples of these operations.\todo{ Definitions and Examples} - -\todo{ resolve confusion between Khatri-Rao, Column-wise Kronecker / Khatri-Rao, Row-wise Kronecker / Khatri-Rao, Face-Splitting Product, .... Yes, it's a mess.} -\paragraph{Kronecker Product $\kron$:} -\paragraph{Khatri-Rao Product $\hada$:} -\paragraph{Transposed Khatri-Rao Product $\odot_t$:} This is also known as the Face-Splitting Product and is the row-wise Kronecker product of two matrices. It relates to the Column-wise Kronecker Product through -\begin{displaymath} - \t{(\mat{A}\odot_{t}\mat{B})} = \t{\mat{A}}\hada\t{\mat{B}} -\end{displaymath} - -\paragraph{$n$-mode unfolding:} \emph{Unfolding}, also known as \emph{flattening} or \emph{matricization}, is a reshaping of a tensor into a matrix with rearrangement of the elements such that mode $n$ corresponds to the rows of the result matrix and all other modes are vectorized in the columns. Let $\ten{T}$ be a tensor of order $m$ with dimensions $t_1\times ... \times t_n\times ... \times t_m$ and elements indexed by $(i_1, ..., i_n, ..., i_m)$. 
The $n$-mode flattening, denoted $\ten{T}_{(n)}$, is defined as a $(t_n, \prod_{k\neq n}t_k)$ matrix with element indices $(i_n, j)$ such that $j = \sum_{k = 1, k\neq n}^m i_k\prod_{l = 1, l\neq n}^{k - 1}t_l$. -\todo{ give an example!} - -\paragraph{$n$-mode Tensor Product $\ttm[n]$:} -The \emph{$n$-mode tensor product} $\ttm[n]$ between a tensor $\mat{T}$ of order $m$ with dimensions $t_1\times t_2\times ... \times t_n\times ... \times t_m$ and a $p\times t_n$ matrix $\mat{M}$ is defined element-wise as -\begin{displaymath} - (\ten{T}\ttm[n] \mat{M})_{i_1, ..., i_{n-1}, j, i_{n+1}, ..., i_m} = \sum_{k = 1}^{t_n} \ten{T}_{i_1, ..., i_{n-1}, k, i_{n+1}, ..., i_m} \mat{M}_{j, k} -\end{displaymath} -where $i_1, ..., i_{n-1}, i_{n+1}, ..., i_m$ run from $1$ to $t_1, ..., t_{n-1}, t_{n+1}, ..., t_m$, respectively. Furthermore, the $n$-th fiber index $j$ of the product ranges from $1$ to $p$. This gives a new tensor $\mat{T}\ttm[n]\mat{M}$ of order $m$ with dimensions $t_1\times t_2\times ... \times p\times ... \times t_m$. - -\begin{example}[Matrix Multiplication Analogs] - Let $\mat{A}$, $\mat{B}$ be two matrices with dimensions $t_1\times t_2$ and $p\times q$, respectively. Then $\mat{A}$ is also a tensor of order $2$, now the $1$-mode and $2$-mode products are element wise given by - \begin{align*} - (\mat{A}\ttm[1] \mat{B})_{i,j} &= \sum_{l = 1}^{t_1} \mat{A}_{l,j}\mat{B}_{i,l} - = (\mat{B}\mat{A})_{i,j} - & \text{for }t_1 = q, \\ - (\mat{A}\ttm[2] \mat{B})_{i,j} &= \sum_{l = 1}^{t_2} \mat{A}_{i,l}\mat{B}_{j,l} - = (\mat{A}\t{\mat{B}})_{i,j} = \t{(\mat{B}\t{\mat{A}})}_{i,j} - & \text{for }t_2 = q. - \end{align*} - In other words, the $1$-mode product equals $\mat{A}\ttm[1] \mat{B} = \mat{B}\mat{A}$ and the $2$-mode is $\mat{A}\ttm[2] \mat{B} = \t{(\mat{B}\t{\mat{A}})}$ in the case of the tensor $\mat{A}$ being a matrix. 
-\end{example} - -\begin{example}[Order Three Analogs] - Let $\mat{A}$ be a tensor of dimensions $t_1\times t_2\times t_3$ and $\mat{B}$ a matrix of dimensions $p\times q$, then the $n$-mode products take the following form - \begin{align*} - (\mat{A}\ttm[1]\mat{B})_{i,j,k} &= \sum_{l = 1}^{t_1} \mat{A}_{l,j,k}\mat{B}_{i,l} & \text{for }t_1 = q, \\ - (\mat{A}\ttm[2]\mat{B})_{i,j,k} &= \sum_{l = 1}^{t_2} \mat{A}_{i,l,k}\mat{B}_{j,l} \equiv (\mat{B}\mat{A}_{i,:,:})_{j,k} & \text{for }t_2 = q, \\ - (\mat{A}\ttm[3]\mat{B})_{i,j,k} &= \sum_{l = 1}^{t_3} \mat{A}_{i,j,l}\mat{B}_{k,l} \equiv \t{(\mat{B}\t{\mat{A}_{i,:,:}})}_{j,k} & \text{for }t_3 = q. - \end{align*} -\end{example} - -Letting $\ten{F}$ be the $3$-tensor of dimensions $n\times k\times r$ such that $\ten{F}_{i,:,:} = \mat{f}_{y_i}$, then -\begin{displaymath} - \mat{\beta}\mat{f}_{y_i}\t{\mat{\alpha}} = (\ten{F}\ttm[2]\mat{\beta}\ttm[3]\mat{\alpha})_{i,:,:} -\end{displaymath} -or in other words, the $i$-th slice of the tensor product $\ten{F}\ttm[2]\mat{\beta}\ttm[3]\mat{\alpha}$ contains $\mat{\beta}\mat{f}_{y_i}\t{\mat{\alpha}}$ for $i = 1, ..., n$. -Another analogous way of writing this is -\begin{displaymath} - (\ten{F}\ttm[2]\mat{\beta}\ttm[3]\mat{\alpha})_{(1)} = \mathbb{F}_{y}(\t{\mat{\alpha}}\kron\t{\mat{\beta}}) -\end{displaymath} - -\section{Equivalencies} -In this section we give a short summary of alternative but equivalent operations. -Using the notation $\widehat{=}$ to indicate that two expressions are identical in the sense that they contain the same elements in the same order but may have different dimensions. Meaning, when vectorizing either side of $\widehat{=}$, they are equal ($\mat{A}\widehat{=}\mat{B}\ :\Leftrightarrow\ \vec{\mat{A}} = \vec{\mat{B}}$). - -Therefore, we use $\mat{A}, \mat{B}, \mat{X}, \mat{F}, \mat{R}, ...$ for matrices. 3-Tensors are written as $\ten{A}, \ten{B}, \ten{T}, \ten{X}, \ten{F}, \ten{R}, ...$. 
- -\begin{align*} - \ten{T}\ttm[3]\mat{A}\ &{\widehat{=}}\ \mat{T}\t{\mat A} & \ten{T}(n, p, q)\ \widehat{=}\ \mat{T}(n p, q), \mat{A}(p, q) \\ - \ten{T}\ttm[2]\mat{B}\ &{\widehat{=}}\ \mat{B}\ten{T}_{(2)} & \ten{T}(n, p, q), \ten{T}_{(2)}(p, n q), \mat{B}(q, p) -\end{align*} - -% \section{Matrix Valued Normal Distribution} -% A random variable $\mat{X}$ of dimensions $p\times q$ is \emph{Matrix-Valued Normal Distribution}, denoted -% \begin{displaymath} -% \mat{X}\sim\mathcal{MN}_{p\times q}(\mat{\mu}, \mat{\Delta}_2, \mat{\Delta}_1), -% \end{displaymath} -% if and only if $\vec\mat{X}\sim\mathcal{N}_{p q}(\vec\mat\mu, \mat\Delta_1\otimes\mat\Delta_2)$. Note the order of the covariance matrices $\mat\Delta_1, \mat\Delta_2$. Its density is given by -% \begin{displaymath} -% f(\mat{X}) = \frac{1}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{1}{2}\tr(\mat\Delta_1^{-1}\t{(\mat X - \mat \mu)}\mat\Delta_2^{-1}(\mat X - \mat \mu))\right). -% \end{displaymath} - -% \section{Sampling form a Multi-Array Normal Distribution} -% Let $\ten{X}$ be an order (rank) $r$ Multi-Array random variable of dimensions $p_1\times...\times p_r$ following a Multi-Array (or Tensor) Normal distributed -% \begin{displaymath} -% \ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r). -% \end{displaymath} -% Its density is given by -% \begin{displaymath} -% f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{q_i}} \Big)^{-1} -% \exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right) -% \end{displaymath} -% with $q_i = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution -% \begin{displaymath} -% \vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1) -% \end{displaymath} -% with $p = \prod_{i = 1}^r p_i$. 
- -% \todo{Check this!!!} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%% Reference Summaries %%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -\section{Reference Summaries} -This section contains short summaries of the main references with each sub-section concerning one paper. - -\subsection{} - -\subsection{Generalized Tensor Decomposition With Features on Multiple Modes} -The \cite{TensorDecomp-HuLeeWang2022} paper proposes a multi-linear conditional mean model for a rank-constrained tensor decomposition. Let the responses $\ten{Y}\in\mathbb{R}^{d_1\times ... \times d_K}$ be an order $K$ tensor. Associated with each mode $k\in[K]$ they assume feature matrices $\mat{X}_k\in\mathbb{R}^{d_k\times p_k}$. Now, they assume that conditional on the feature matrices $\mat{X}_k$ the entries of the tensor $\ten{Y}$ are independent realizations. The rank constraint is specified through $\mat{r} = (r_1, ..., r_K)$, then the model is given by -\begin{displaymath} - \E(\ten{Y} | \mat{X}_1, ..., \mat{X}_K) = f(\ten{C}\times\{ \mat{X}_1\mat{M}_1, ..., \mat{X}_K\mat{M}_K \}),\qquad \t{\mat{M}_k}\mat{M}_k = \mat{I}_{r_k}\ \forall k\in[K]. -\end{displaymath} -The order $K$ tensor $\ten{C}\in\mathbb{R}^{r_1\times...\times r_K}$ is an unknown full-rank core tensor and the matrices $\mat{M}_k\in\mathbb{R}^{p_k\times r_k}$ are unknown factor matrices. The function $f$ is applied element wise and serves as the link function based on the assumed distribution family of the tensor entries. Finally, the operation $\times$ denotes the tensor-by-matrix product using a short hand -\begin{displaymath} - \ten{C}\times\{ \mat{X}_1\mat{M}_1, ..., \mat{X}_K\mat{M}_K \} - = \ten{C}\ttm[1]\mat{X}_1\mat{M}_1\ ...\ttm[K]\mat{X}_K\mat{M}_K -\end{displaymath} -with $\ttm[k]$ denoting the $k$-mode tensor matrix product. 
- -The algorithm for estimation of $\ten{C}$ and $\mat{M}_1, ..., \mat{M}_K$ assumes the individual conditional entries of $\ten{Y}$ to be independent and to follow a generalized linear model with link function $f$. The proposed algorithm is an iterative algorithm for minimizing the negative log-likelihood -\begin{displaymath} - l(\ten{C}, \mat{M}_1, ..., \mat{M}_K) = \langle \ten{Y}, \Theta \rangle - \sum_{i_1, ..., i_K} b(\Theta_{i_1, ..., i_K}), \qquad \Theta = \ten{C}\times\{ \mat{X}_1\mat{M}_1, ..., \mat{X}_K\mat{M}_K \} -\end{displaymath} -where $b = f'$ is the derivative of the canonical link function $f$ in the generalized linear model the conditioned entries of $\ten{Y}$ follow. The algorithm utilizes the higher-order SVD (HOSVD) to enforce the rank-constraint. - -The main benefit is that this approach generalizes well to a multitude of different structured data sets. - -\todo{ how does this relate to the $\mat{X} = \mat{\mu} + \mat{\beta}\mat{f}_y\t{\mat{\alpha}} + \mat{\epsilon}$ model.} - -\end{document} diff --git a/LaTeX/main.tex b/LaTeX/main.tex deleted file mode 100644 index b569869..0000000 --- a/LaTeX/main.tex +++ /dev/null @@ -1,1196 +0,0 @@ -\documentclass[a4paper, 10pt]{article} - -\usepackage[utf8]{inputenc} -\usepackage[T1]{fontenc} -\usepackage{fullpage} -\usepackage{amsmath, amssymb, amstext, amsthm} -\usepackage{bm} % \boldsymbol and italic corrections, ... -\usepackage[pdftex]{hyperref} -\usepackage{makeidx} % Index (Symbols, Names, ...) -\usepackage{xcolor, graphicx} % colors and including images -\usepackage{tikz} -\usetikzlibrary{calc} -\usepackage[ - % backend=bibtex, - style=authoryear-comp -]{biblatex} -\usepackage{algorithm, algpseudocode} % Pseudo Codes / Algorithms - -% Document meta info -\title{Higher Order Parametric Inverse Regression HO-PIR} -\author{Daniel Kapla} -\date{\today} -% Set PDF title, author and creator. 
-\AtBeginDocument{ - \hypersetup{ - pdftitle = {Higher Order Parametric Inverse Regression HO-PIR}, - pdfauthor = {Daniel Kapla}, - pdfcreator = {\pdftexbanner} - } -} - -\makeindex - -% Bibliography resource(s) -\addbibresource{main.bib} - -% Setup environments -% Theorem, Lemma -\theoremstyle{plain} -\newtheorem{theorem}{Theorem} -\newtheorem{lemma}{Lemma} -\newtheorem{example}{Example} -% Definition -\theoremstyle{definition} -\newtheorem{defn}{Definition} -% Remark -\theoremstyle{remark} -\newtheorem{remark}{Remark} - -% Define math macros -\newcommand{\mat}[1]{\boldsymbol{#1}} -\newcommand{\ten}[1]{\mathcal{#1}} -\renewcommand{\vec}{\operatorname{vec}} -\newcommand{\dist}{\operatorname{dist}} -\newcommand{\rank}{\operatorname{rank}} -\DeclareMathOperator{\kron}{\otimes} % Kronecker Product -\DeclareMathOperator{\hada}{\odot} % Hadamard Product -\newcommand{\ttm}[1][n]{\times_{#1}} % n-mode product (Tensor Times Matrix) -\DeclareMathOperator{\df}{df} -\DeclareMathOperator{\tr}{tr} -\DeclareMathOperator{\var}{Var} -\DeclareMathOperator{\cov}{Cov} -\DeclareMathOperator{\Span}{Span} -\DeclareMathOperator{\E}{\operatorname{\mathbb{E}}} -% \DeclareMathOperator{\independent}{{\bot\!\!\!\bot}} -\DeclareMathOperator*{\argmin}{{arg\,min}} -\DeclareMathOperator*{\argmax}{{arg\,max}} -\newcommand{\D}{\textnormal{D}} % derivative -\renewcommand{\d}{\textnormal{d}} % differential -\renewcommand{\t}[1]{{#1^{\prime}}} % matrix transpose -\newcommand{\pinv}[1]{{#1^{\dagger}}} % `Moore-Penrose pseudoinverse` - -\newcommand{\todo}[1]{{\color{red}TODO: #1}} -\newcommand{\effie}[1]{{\color{blue}Effie: #1}} - -% Pseudo Code Commands -\newcommand{\algorithmicbreak}{\textbf{break}} -\newcommand{\Break}{\State \algorithmicbreak} - -\begin{document} - -\maketitle - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%% Preliminary %%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{Notation} -Vectors are write 
as boldface lowercase letters (e.g. $\mat a$, $\mat b$), matrices use boldface uppercase or Greek letters (e.g. $\mat A$, $\mat B$, $\mat\alpha$, $\mat\Delta$). The identity matrix of dimensions $p\times p$ is denoted by $\mat{I}_p$ and the commutation matrix as $\mat{K}_{p, q}$ or $\mat{K}_p$ in case of $p = q$. Tensors, meaning multi-dimensional arrays of order at least 3, use uppercase calligraphic letters (e.g. $\ten{A}$, $\ten{B}$, $\ten{X}$, $\ten{Y}$, $\ten{F}$). Boldface indices (e.g. $\mat{i}, \mat{j}, \mat{k}$) denote multi-indices $\mat{i} = (i_1, ..., i_r)\in[\mat{d}]$ where the bracket notation is a shorthand for $[r] = \{1, ..., r\}$ which in conjunction with a multi-index as argument means $[\mat{d}] = [d_1]\times ... \times[d_r]$. - -Let $\ten{A} = (a_{i_1,...,i_r})\in\mathbb{R}^{d_1\times ...\times d_r}$ be an order\footnote{Also called rank, therefore the variable name $r$, but this term is \emph{not} used as it leads to confusion with the rank as in ``the rank of a matrix''.} $r$ tensor where $r\in\mathbb{N}$ is the number of modes or axes of $\ten{A}$. For matrices $\mat{B}_k\in\mathbb{R}^{p_k\times d_k}$ with $k\in[r] = \{1, 2, ..., r\}$ the \emph{multi-linear multiplication} is defined element wise as -\begin{displaymath} - (\ten{A}\times\{\mat{B}_1, ..., \mat{B}_r\})_{j_1, ..., j_r} = \sum_{i_1, ..., i_r = 1}^{d_1, ..., d_r} a_{i_1, ..., i_r}(B_{1})_{j_1, i_1} \cdots (B_{r})_{j_r, i_r} -\end{displaymath} -which results in an order $r$ tensor of dimensions $p_1\times ...\times p_r$. With this the \emph{$k$-mode product} between the tensor $\ten{A}$ with the matrix $\mat{B}_k$ is given by -\begin{displaymath} - \ten{A}\times_k\mat{B}_k = \ten{A}\times\{\mat{I}_{d_1}, ..., \mat{I}_{d_{k-1}}, \mat{B}_{k}, \mat{I}_{d_{k+1}}, ..., \mat{I}_{d_r}\}. -\end{displaymath} -Furthermore, the notation $\ten{A}\times_{k\in S}$ is a short hand for writing the iterative application of the mode product for all indices in $S\subset[r]$. 
For example $\ten{A}\times_{k\in\{2, 5\}}\mat{B}_k = \ten{A}\times_2\mat{B}_2\times_5\mat{B}_5$. By only allowing $S$ to be a set this notation is unambiguous because the mode products commute for different modes $j\neq k\Rightarrow\ten{A}\times_j\mat{B}_j\times_k\mat{B}_k = \ten{A}\times_k\mat{B}_k\times_j\mat{B}_j$. - -The \emph{inner product} between two tensors of the same order and dimensions is -\begin{displaymath} - \langle\ten{A}, \ten{B}\rangle = \sum_{i_1, ..., i_r} a_{i_1, ..., i_r}b_{i_1, ..., i_r} -\end{displaymath} -with which the \emph{Frobenius Norm} $\|\ten{A}\|_F = \sqrt{\langle\ten{A}, \ten{A}\rangle}$. Of interest is also the \emph{maximum norm} $\|\ten{A}\|_{\infty} = \max_{i_1, ..., i_r} a_{i_1, ..., i_r}$. Furthermore, the Frobenius and maximum norm are also used for matrices while for a vector $\mat{a}$ the \emph{2 norm} is $\|\mat{a}\|_2 = \sqrt{\langle\mat{a}, \mat{a}\rangle}$. - -Matrices and tensors can be \emph{vectorized} by the \emph{vectorization} operator $\vec$. For tensors of order at least $2$ the \emph{flattening} (or \emph{unfolding} or \emph{matricization}) is a reshaping of the tensor into a matrix along a particular mode. For a tensor $\ten{A}$ of order $r$ and dimensions $d_1, ..., d_r$ the $k$-mode unfolding $\ten{A}_{(k)}$ is a $d_k\times \prod_{l=1, l\neq k}d_l$ matrix. For the tensor $\ten{A} = (a_{i_1,...,i_r})\in\mathbb{R}^{d_1\times ...\times d_r}$ the elements of the $k$-mode unfolded tensor $\ten{A}_{(k)}$ are -\begin{displaymath} - (\ten{A}_{(k)})_{i_k, j} = a_{i_1, ..., i_r}\quad\text{ with }\quad j = 1 + \sum_{\substack{l = 1\\l \neq k}}^r (i_l - 1) \prod_{\substack{m = 1\\m\neq k}}^{l - 1}d_m. -\end{displaymath} - -The rank of a tensor $\ten{A}$ of dimensions $d_1\times ...\times d_r$ is given by a vector $\rank{\ten{A}} = (a_1, ..., a_r)\in[d_1]\times...\times[d_r]$ where $a_k = \rank(\ten{A}_{(k)})$ is the usual matrix rank of the $k$-mode unfolded tensor. 
- - -\todo{$\mathcal{S}^p$, $\mathcal{S}_{+}^p$, $\mathcal{S}_{++}^p$ symmetric matrices of dimensions $p\times p$, or call it $\operatorname{Sym}(p)$} - -\todo{The group of orthogonal matrices $O(p)$ of dim $p\times p$, where $O(p, q)$ are the $p\times q$ matrices (a.k.a. the Stiefel manifold)} - - -\section{Tensor Normal Distribution} -Let $\ten{X}$ be a multi-dimensional array random variable of order $r$ with dimensions $p_1\times ... \times p_r$ written as -\begin{displaymath} - \ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r). -\end{displaymath} -Its density is given by -\begin{displaymath} - f(\ten{X}) = \Big( (2\pi)^{p} \prod_{i = 1}^r |\mat{\Delta}_i|^{p / p_i} \Big)^{-1/2} - \exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right) -\end{displaymath} -where $p = \prod_{i = 1}^r p_i$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution -\begin{displaymath} - \vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1). -\end{displaymath} - -\begin{theorem}[Tensor Normal to Multi-Variate Normal equivalence] - For a multi-dimensional random variable $\ten{X}$ of order $r$ with dimensions $p_1\times ...\times p_r$. Let $\ten{\mu}$ be the mean of the same order and dimensions as $\ten{X}$ and the mode covariance matrices $\mat{\Delta}_i$ of dimensions $p_i\times p_i$ for $i = 1, ..., r$. Then the tensor normal distribution is equivalent to the multi-variate normal distribution by the relation - \begin{displaymath} - \ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r) - \qquad\Leftrightarrow\qquad - \vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes ...\otimes \mat{\Delta}_1) - \end{displaymath} - where $p = \prod_{i = 1}^r p_i$. 
-\end{theorem} -\begin{proof} - A straight forward way is to rewrite the Tensor Normal density as the density of a Multi-Variate Normal distribution depending on the vectorization of $\ten{X}$. First consider - \begin{align*} - \langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle - &= \t{\vec(\ten{X} - \mu)}\vec((\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\}) \\ - &= \t{\vec(\ten{X} - \mu)}(\mat{\Delta}_r^{-1}\otimes ...\otimes\mat{\Delta}_1^{-1})\vec(\ten{X} - \mu) \\ - &= \t{(\vec\ten{X} - \vec\mu)}(\mat{\Delta}_r\otimes ...\otimes\mat{\Delta}_1)^{-1}(\vec\ten{X} - \vec\mu). - \end{align*} - Next, using a property of the determinant of a Kronecker product $|\mat{\Delta}_1\otimes\mat{\Delta}_2| = |\mat{\Delta}_1|^{p_2}|\mat{\Delta}_2|^{p_1}$ yields - \begin{displaymath} - |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1| - = |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_2|^{p_1}|\mat{\Delta}_1|^{p / p_1} - \end{displaymath} - where $p = \prod_{j = 1}^r p_j$. By induction over $r$ the relation - \begin{displaymath} - |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1| - = \prod_{i = 1}^r |\mat{\Delta}_i|^{p / p_i} - \end{displaymath} - holds for arbitrary order $r$. Substituting into the Tensor Normal density leads to - \begin{align*} - f(\ten{X}) = \Big( (2\pi)^p |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1| \Big)^{-1/2} - \exp\!\left( -\frac{1}{2}\t{(\vec\ten{X} - \vec\mu)}(\mat{\Delta}_r\otimes ...\otimes\mat{\Delta}_1)^{-1}(\vec\ten{X} - \vec\mu) \right) - \end{align*} - which is the Multi-Variate Normal density of the $p$ dimensional vector $\vec\ten{X}$. -\end{proof} - -When sampling from the Multi-Array Normal one way is to sample from the Multi-Variate Normal and then reshaping the result, but this is usually very inefficient because it requires to store the multi-variate covariance matrix which is very big. 
Instead, it is more efficient to sample $\ten{Z}$ as a tensor of the same shape as $\ten{X}$ with standard normal entries and then transform the $\ten{Z}$ to follow the Multi-Array Normal as follows -\begin{displaymath} - \ten{Z}\sim\mathcal{TN}(0, \mat{I}_{p_1}, ..., \mat{I}_{p_r}) - \quad\Rightarrow\quad - \ten{X} = \ten{Z}\times\{\mat{\Delta}_1^{1/2}, ..., \mat{\Delta}_r^{1/2}\} + \mu\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r). -\end{displaymath} -where the sampling from the standard Multi-Array Normal is done by sampling all of the elements of $\ten{Z}$ from a standard Normal. - - -% \section{Introduction} -% We assume the model -% \begin{displaymath} -% \ten{X} = \ten{\mu} + \ten{F} \times \{ \mat{\alpha}_1, ..., \mat{\alpha}_r \} + \ten{\epsilon} -% \end{displaymath} -% where the dimensions of all the components are listed in Table~\ref{tab:dimensions}. -% % and its vectorized form -% % \begin{displaymath} -% % \vec\mat{X} = \vec\mat{\mu} + (\mat{\alpha}\kron\mat{\beta})\vec\mat{f}_y + \vec\mat{\epsilon} -% % \end{displaymath} - -% \begin{table}[!htp] -% \centering -% \begin{minipage}{0.8\textwidth} -% \centering -% \begin{tabular}{l l l} -% $\mat X, \mat\mu, \mat R, \mat\epsilon$ & $p_1\times p_2$ \\ -% $\ten X, \ten\mu, \ten R, \ten\epsilon$ & $p_1\times ...\times p_r$ & {\small\color{gray} (Population Level)} \\ -% $\ten X, \ten\mu, \ten R, \ten\epsilon$ & $p_1\times ...\times p_r\times n$ & {\small\color{gray} (Sample Level)} \\ -% $\mat{f}_y$ & $q_1\times q_2$ \\ -% $\ten{F}$ & $q_1\times ...\times q_r$ & {\small\color{gray} (Population Level)} \\ -% $\ten{F}$ & $q_1\times ...\times q_r\times n$ & {\small\color{gray} (Sample Level)} \\ -% $\mat{\alpha}_j$ & $p_j\times q_j$ & $j = 1, ..., r$ \\ -% $\mat\Delta_j$ & $q\times q$ & $j = 1, ..., r$ -% \end{tabular} -% \caption{\label{tab:dimensions}Summary listing of dimensions.} -% \end{minipage} -% \end{table} - -% The log-likelihood $l$ given $n$ i.i.d. 
observations assuming that $\mat{X}_i\mid(Y = y_i)$ is normal distributed as -% \begin{displaymath} -% \vec\mat{X}_i \sim \mathcal{N}_{p q}(\vec\mat\mu + (\mat\alpha\kron\mat\beta)\vec\mat{f}_{y_i}, \Delta) -% \end{displaymath} -% Replacing all unknown by there estimates gives the (estimated) log-likelihood -% \begin{equation}\label{eq:log-likelihood-est} -% \hat{l}(\mat\alpha, \mat\beta) = -\frac{n q p}{2}\log 2\pi - \frac{n}{2}\log|\widehat{\mat\Delta}| - \frac{1}{2}\sum_{i = 1}^n \t{\mat{r}_i}\widehat{\mat\Delta}^{-1}\mat{r}_i -% \end{equation} -% where the residuals are -% \begin{displaymath} -% \mat{r}_i = \vec\mat{X}_i - \vec\overline{\mat{X}} - (\mat\alpha\kron\mat\beta)\vec{\mat f}_{y_i}\qquad (p q \times 1) -% \end{displaymath} -% and the MLE estimate assuming $\mat\alpha, \mat\beta$ known for the covariance matrix $\widehat{\mat\Delta}$ as solution to the score equations is -% \begin{equation}\label{eq:Delta} -% \widehat{\mat\Delta} = \frac{1}{n}\sum_{i = 1}^n \mat{r}_i \t{\mat{r}_i} \qquad(p q \times p q). -% \end{equation} -% Note that the log-likelihood estimate $\hat{l}$ only depends on $\mat\alpha, \mat\beta$. Next, we compute the gradient for $\mat\alpha$ and $\mat\beta$ of $\hat{l}$ used to formulate a Gradient Descent base estimation algorithm for $\mat\alpha, \mat\beta$ as the previous algorithmic. The main reason is to enable an estimation for bigger dimensions of the $\mat\alpha, \mat\beta$ coefficients since the previous algorithm does \emph{not} solve the high run time problem for bigger dimensions. 
- -% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% %%% Derivative %%% -% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% \section{Derivative of the Log-Likelihood} -% Start with the general case of $\mat X_i|(Y_i = y_i)$ is multivariate normal distributed with the covariance $\mat\Delta$ being a $p q\times p q$ positive definite symmetric matrix \emph{without} an further assumptions. We have $i = 1, ..., n$ observations following -% \begin{displaymath} -% \mat{r}_i = \vec(\mat X_i - \mat\mu - \mat\beta\mat{f}_{y_i}\t{\mat\alpha}) \sim \mathcal{N}_{p q}(\mat 0, \mat\Delta). -% \end{displaymath} -% The MLE estimates of $\mat\mu, \mat\Delta$ are -% \begin{displaymath} -% \widehat{\mat\mu} = \overline{\mat X} = \frac{1}{n}\sum_{i = 1}^n \mat X_i {\color{gray}\qquad(p\times q)}, -% \qquad \widehat{\mat\Delta} = \frac{1}{n}\sum_{i = 1}^n \mat r_i\t{\mat r_i} {\color{gray}\qquad(p q\times p q)}. -% \end{displaymath} -% Substitution of the MLE estimates into the log-likelihood $l(\mat\mu, \mat\Delta, \mat\alpha, \mat\beta)$ gives the estimated log-likelihood $\hat{l}(\mat\alpha, \mat\beta)$ as -% \begin{displaymath} -% \hat{l}(\mat\alpha, \mat\beta) = -\frac{n q p}{2}\log 2\pi - \frac{n}{2}\log|\widehat{\mat\Delta}| - \frac{1}{2}\sum_{i = 1}^n \t{\mat{r}_i}\widehat{\mat\Delta}^{-1}\mat{r}_i. -% \end{displaymath} -% We are interested in the gradients $\nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta)$, $\nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta)$ of the estimated log-likelihood. Therefore, we consider the differential of $\hat{l}$. 
-% \begin{align} -% \d\hat{l}(\mat\alpha, \mat\beta) -% &= -\frac{n}{2}\log|\widehat{\mat{\Delta}}| - \frac{1}{2}\sum_{i = 1}^n \big(\t{(\d \mat{r}_i)}\widehat{\mat{\Delta}}^{-1} \mat{r}_i + \t{\mat{r}_i}(\d\widehat{\mat{\Delta}}^{-1}) \mat{r}_i + \t{\mat{r}_i}\widehat{\mat{\Delta}}^{-1} \d \mat{r}_i\big) \nonumber\\ -% &= \underbrace{-\frac{n}{2}\log|\widehat{\mat{\Delta}}| - \frac{1}{2}\sum_{i = 1}^n \t{\mat{r}_i}(\d\widehat{\mat{\Delta}}^{-1}) \mat{r}_i}_{=\,0\text{ due to }\widehat{\mat{\Delta}}\text{ beeing the MLE}} \label{eq:deriv1} -% - \sum_{i = 1}^n \t{\mat{r}_i}\widehat{\mat{\Delta}}^{-1} \d \mat{r}_i. -% \end{align} -% The next step is to compute $\d \mat{r}_i$ which depends on both $\mat\alpha$ and $\mat\beta$ -% \begin{align*} -% \d\mat{r}_i(\mat\alpha, \mat\beta) -% &= -\d(\mat\alpha\kron \mat\beta)\vec\mat{f}_{y_i} \\ -% &= -\vec\!\big( \mat{I}_{p q}\,\d(\mat\alpha\kron \mat\beta)\vec\mat{f}_{y_i} \big) \\ -% &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})\,\d\vec(\mat\alpha\kron \mat\beta) \\ -% \intertext{using the identity \ref{eq:vecKron}, to obtain vectorized differentials, gives} -% \dots -% &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \,\d(\vec \mat\alpha\kron \vec \mat\beta) \\ -% &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \big((\d\vec \mat\alpha)\kron \vec \mat\beta + \vec \mat\alpha\kron (\d\vec \mat\beta)\big) \\ -% &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \big(\mat{I}_{r q}(\d\vec \mat\alpha)\kron (\vec \mat\beta)\mat{I}_1 + (\vec \mat\alpha)\mat{I}_1\kron \mat{I}_{k p}(\d\vec \mat\beta)\big) \\ -% &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \big((\mat{I}_{r q}\kron\vec \mat\beta)\d\vec \mat\alpha + (\vec \mat\alpha\kron \mat{I}_{k p})\d\vec \mat\beta\big) -% \end{align*} -% Now, substitution of $\d\mat{r}_i$ into \eqref{eq:deriv1} gives the 
gradients (not dimension standardized versions of $\D\hat{l}(\mat\alpha)$, $\D\hat{l}(\mat\beta)$) by identification of the derivatives from the differentials (see: \todo{appendix}) -% \begin{align*} -% \nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta) &= -% \sum_{i = 1}^n (\t{\vec(\mat{f}_{y_i})}\kron\t{\mat{r}_i}\widehat{\mat\Delta}^{-1}) (\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) (\mat{I}_{r q}\kron\vec \mat\beta), -% {\color{gray}\qquad(q\times r)} \\ -% \nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta) &= -% \sum_{i = 1}^n (\t{\vec(\mat{f}_{y_i})}\kron\t{\mat{r}_i}\widehat{\mat\Delta}^{-1}) (\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) (\vec \mat\alpha\kron \mat{I}_{k p}). -% {\color{gray}\qquad(p\times k)} -% \end{align*} -% These quantities are very verbose as well as completely unusable for an implementation. By detailed analysis of the gradients we see that the main parts are only element permutations with a high sparsity. By defining the following compact matrix -% \begin{equation}\label{eq:permTransResponse} -% \mat G = \vec^{-1}_{q r}\bigg(\Big( \sum_{j = 1}^n \vec\mat{f}_{y_j}\otimes \widehat{\mat\Delta}^{-1}\mat{r}_j \Big)_{\pi(i)}\bigg)_{i = 1}^{p q k r}{\color{gray}\qquad(q r \times p k)} -% \end{equation} -% with $\pi$ being a permutation of $p q k r$ elements corresponding to permuting the axis of a 4D tensor of dimensions $p\times q\times k\times r$ by $(2, 4, 1, 3)$. As a generalization of transposition this leads to a rearrangement of the elements corresponding to the permuted 4D tensor with dimensions $q\times r\times p\times k$ which is then vectorized and reshaped into a matrix of dimensions $q r \times p k$. With $\mat G$ the gradients simplify to \todo{validate this mathematically} -% \begin{align*} -% \nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta) &= -% \vec_{q}^{-1}(\mat{G}\vec{\mat\beta}), -% {\color{gray}\qquad(q\times r)} \\ -% \nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta) &= -% \vec_{p}^{-1}(\t{\mat{G}}\vec{\mat\alpha}). 
-% {\color{gray}\qquad(p\times k)} -% \end{align*} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{Kronecker Covariance Structure Model}\label{sec:kron_cov} -% \section{Kronecker Covariance Structure}\label{sec:kron_cov} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -Let $\ten{X}, \ten{F}$ be order $r$ tensors of dimensions $p_1\times ... \times p_r$ and $q_1\times ... \times q_r$, respectively. We assume the population model -\begin{align*} - \ten{X} &= \ten{\mu} + \ten{F}\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r \} + \ten{\epsilon} \\ - &= \ten{\mu} + \ten{F}\times_{j\in[r]}\mat{\alpha}_j + \ten{\epsilon} -\end{align*} -where the error tensor $\epsilon$ is of the same order and dimensions as $\ten{X}$. The distribution of the error tensor is assumed to be mean zero tensor distributed $\ten{\epsilon}\sim\mathcal{TN}(0, \mat{\Delta}_1, ..., \mat{\Delta}_r)$ for symmetric, positive definite covariance matrices $\mat{\Delta}_j$, $j = 1, ..., r$. - -Given $i = 1, ..., n$ i.i.d. observations $(\ten{X}_i, \ten{F}_i)$, the sample model analog is -\begin{align} - \ten{X} &= \ten{\mu} + \ten{F}\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{I}_n \} + \ten{\epsilon} \nonumber \\ - &= \ten{\mu} + \ten{F}\times_{j\in[r]}\mat{\alpha}_j + \ten{\epsilon} \label{eq:sample_model} -\end{align} -where the model tensors $\ten{X}, \ten{F}$ collect all observations on an additional sample axis in the last mode, making them tensors of order $r + 1$. Meaning that $\ten{X}$, $\ten{\mu}$ and $\ten{\epsilon}$ have dimensions $p_1\times ...\times p_r\times n$ and $\ten{F}$ is of dimensions $q_1\times ...\times q_r\times n$. The mean tensor $\ten{\mu}$ replicates its entries $\ten{\mu}_i = \ten{\mu}_1$, $i = 1, ..., n$. Let the estimated residual tensor be -\begin{displaymath} - \ten{R} = \ten{X} - \ten{\mu} - \ten{F}\times_{j\in[r]}\mat{\alpha}_j. 
-\end{displaymath} - -In the following we assume w.l.o.g. that the mean tensor $\ten{\mu} = 0$. - -\begin{table}[!htp] - \centering - \begin{minipage}{0.8\textwidth} - \centering - \begin{tabular}{l l l} - $\ten X, \ten\mu, \ten R, \ten\epsilon$ & $p_1\times ...\times p_r$ & {\small\color{gray} (Population Level)} \\ - $\ten X, \ten\mu, \ten R, \ten\epsilon$ & $p_1\times ...\times p_r\times n$ & {\small\color{gray} (Sample Level)} \\ - $\ten{F}$ & $q_1\times ...\times q_r$ & {\small\color{gray} (Population Level)} \\ - $\ten{F}$ & $q_1\times ...\times q_r\times n$ & {\small\color{gray} (Sample Level)} \\ - $\mat{\alpha}_j$ & $p_j\times q_j$ & $j = 1, ..., r$ \\ - $\mat\Delta_j$ & $p_j\times p_j$ & $j = 1, ..., r$ - \end{tabular} - \caption{\label{tab:dimensions}Summary listing of dimensions.} - \end{minipage} -\end{table} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Least Squares estimates}\label{sec:ls} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -The least squares estimates for $\mat{\alpha}_j$, $j = 1, ..., r$ given $n$ i.i.d. observations $(\ten{X}_i, \ten{F}_i)$ are the solution to the minimization problem -\begin{displaymath} - \widehat{\mat{\alpha}}_1, ..., \widehat{\mat{\alpha}}_r - = \argmin_{\mat{\alpha}_1, ..., \mat{\alpha}_r} \| \ten{X} - \ten{F}\times_{j\in[r]}\mat{\alpha}_j \|_F^2.
-\end{displaymath} -With the identities $\|\ten{A}\|_F^2 = \tr(\ten{A}_{(j)}\t{\ten{A}_{(j)}})$ and $(\ten{A}\times_j\mat{B})_{(j)} = \mat{B}\ten{A}_{(j)}$ for any $j$ it follows that the differential of the Frobenius norm is equal to -\begin{align*} - \d \| \ten{X} - \ten{F}\times_{j\in[r]}\mat{\alpha}_j \|_F^2 - &= -2 \sum_{j = 1}^r \tr\Big( (\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k\times_j\d\mat{\alpha}_j)_{(j)}\t{(\ten{X} - \ten{F}\times_{k\in[r]}\mat{\alpha}_k)_{(j)}} \Big) \\ - &= -2 \sum_{j = 1}^r \tr\Big( (\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}\t{(\ten{X} - \ten{F}\times_{k\in[r]}\mat{\alpha}_k)_{(j)}}\d\mat{\alpha}_j \Big). -\end{align*} -Equating to zero for each differential leads to normal equations for the $\mat{\alpha}_j$ parameter matrices -\begin{displaymath} - (\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}\t{\ten{X}_{(j)}} = (\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}\t{(\ten{F}\times_{k\in[r]}\mat{\alpha}_k)_{(j)}}, \qquad j = 1, ..., r -\end{displaymath} -where the normal equations for $\mat{\alpha}_j$ depend on all the other $\mat{\alpha}_k$. Solving for $\mat{\alpha}_j$ gives a system of equations -\begin{equation}\label{eq:ls_est_alphas} - \widehat{\mat{\alpha}}_j = \ten{X}_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}}[(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}}]^{-1}, \qquad j = 1, ..., r. -\end{equation} - -\begin{example}[Vector Valued LS ($r = 1$)]\label{ex:ls_vector_case} - Considering the vector valued case ($r = 1$), then the sample tensors $\ten{F} = \ten{F}_{(1)} = \t{\mat{F}}$ and $\ten{X} = \ten{X}_{(1)} = \t{\mat{X}}$ which are both matrices of dimensions $n\times q$ and $n\times p$, respectively.
The LS estimate for the single parameter matrix $\mat{\alpha} = \mat{\alpha}_1$ is - \begin{displaymath} - \widehat{\mat{\alpha}} - = \ten{X}_{(1)}\t{(\ten{F}\times_{k\in\emptyset}\widehat{\mat{\alpha}}_k)_{(1)}}[(\ten{F}\times_{k\in\emptyset}\widehat{\mat{\alpha}}_k)_{(1)}\t{(\ten{F}\times_{k\in\emptyset}\widehat{\mat{\alpha}}_k)_{(1)}}]^{-1} - = \ten{X}_{(1)}\t{\ten{F}_{(1)}}(\ten{F}_{(1)}\t{\ten{F}_{(1)}})^{-1} - = \t{\mat{X}}\mat{F}(\t{\mat{F}}\mat{F})^{-1}. - \end{displaymath} -\end{example} - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{MLE estimates}\label{sec:mle} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -By the definition of the Tensor Normal, with $p = \prod_{j = 1}^r p_j$, we get for $n$ i.i.d. observations $\ten{X}_i, \ten{F}_i$ the log-likelihood in terms of the residuals as -\begin{displaymath} - l = -\frac{n p}{2}\log 2\pi - -\sum_{j = 1}^r \frac{n p}{2 p_j}\log|\mat{\Delta}_j| - -\frac{1}{2}\langle \ten{R}\times_{j\in[r]}\mat{\Delta}_j^{-1}, \ten{R} \rangle. -\end{displaymath} -Note that the log-likelihood depends not only on the covariance matrices $\mat{\Delta}_j$, $j = 1, ..., r$ but also on the parameter matrices $\mat{\alpha}_j$, $j = 1, ..., r$ through the residuals $\ten{R}$ (mean $\mu = 0$ is assumed). - -For deriving the MLE estimates we compute the differential of the log-likelihood given the data as -\begin{displaymath} - \d l = - -\sum_{j = 1}^r \frac{n p}{2 p_j}\d\log|{\mat{\Delta}}_j| - -\frac{1}{2}\sum_{j = 1}^r\langle {\ten{R}}\times_{k\in[r]\backslash j}{\mat{\Delta}}_k^{-1}\times_j\d{\mat{\Delta}}^{-1}_j, \ten{R} \rangle - -\langle {\ten{R}}\times_{j\in[r]}{\mat{\Delta}}_j^{-1}, \d{\ten{R}} \rangle. 
-\end{displaymath} -Using $\d\log|\mat{A}| = \tr(\mat{A}^{-1}\d\mat{A})$ and $\d\mat{A}^{-1} = -\mat{A}^{-1}(\d\mat{A})\mat{A}^{-1}$ as well as $\langle\ten{A}, \ten{B}\rangle = \tr(\ten{A}_{(j)}\t{\ten{B}_{(j)}})$ for any $j = 1, ..., r$ we get the differential of the log-likelihood as -\begin{align*} - \d l - &= - -\sum_{j = 1}^r \frac{n p}{2 p_j}\tr({\mat{\Delta}}_j^{-1}\d{\mat{\Delta}}_j) - -\frac{1}{2}\sum_{j = 1}^r\tr\!\Big((\d{{\mat{\Delta}}}^{-1}_j)({\ten{R}}\times_{k\in[r]\backslash j}{{\mat{\Delta}}}_k^{-1})_{(j)}\t{\ten{R}_{(j)}}\Big) - -\langle {\ten{R}}\times_{j\in[r]}{{\mat{\Delta}}}_j^{-1}, \d{\ten{R}} \rangle \\ - &= - -\frac{1}{2}\sum_{j = 1}^r \tr\left(\Big( - \frac{n p}{p_j}\mat{I}_{p_j} - {\mat{\Delta}}_j^{-1}(\ten{R}\times_{k\in[r]\backslash j}{{\mat{\Delta}}}_k^{-1})_{(j)}\t{\ten{R}_{(j)}} - \right){\mat{\Delta}}_j^{-1}\d{\mat{\Delta}}_j\Big) - -\langle \ten{R}\times_{j\in[r]}{{\mat{\Delta}}}_j^{-1}, \d\ten{R} \rangle. -\end{align*} -With $\ten{R}$ not dependent on the $\mat{\Delta}_j$'s we get the covariance MLE estimates -\begin{equation}\label{eq:mle_est_Deltas} - \widehat{\mat{\Delta}}_j = \frac{p_j}{n p}(\ten{R}\times_{k\in[r]\backslash j}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}\t{\ten{R}_{(j)}} - \qquad{\color{gray}p_j\times p_j}, \qquad j = 1, ..., r. -\end{equation} -At the same time with the same argument the gradients with respect to the covariance matrices are given by -\begin{align*} - \nabla_{{\mat{\Delta}}_j}l &= \frac{1}{2}{\mat{\Delta}}_j^{-1}\big( - \ten{R}_{(j)}\t{(\ten{R}\times_{k\in[r]\backslash j}{{\mat{\Delta}}}_k^{-1})_{(j)}}{\mat{\Delta}}_j^{-1} - \frac{n p}{p_j}\mat{I}_{p_j} - \big) \\ - &= \frac{1}{2}{\mat{\Delta}}_j^{-1}\big( - \ten{R}_{(j)}\t{(\ten{R}\times_{k\in[r]}{{\mat{\Delta}}}_k^{-1})_{(j)}} - \frac{n p}{p_j}\mat{I}_{p_j} - \big) - \qquad{\color{gray}p_j\times p_j} && j = 1, ..., r. 
-\end{align*} -The remaining part is -\begin{align*} - \langle \ten{R}\times_{j\in[r]}{\widehat{\mat{\Delta}}}_j^{-1}, \d\ten{R} \rangle - &= -\sum_{j = 1}^r \langle \ten{R}\times_{k\in[r]}{\widehat{\mat{\Delta}}}_k^{-1}, \ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k\times_j\d\widehat{\mat{\alpha}}_j \rangle \\ - &= -\sum_{j = 1}^r \tr\big( (\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}\t{(\ten{R}\times_{k\in[r]}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}}\d\widehat{\mat{\alpha}}_j \big). -\end{align*} -Through that the gradient for all the parameter matrices is -\begin{align*} - \nabla_{\mat{\alpha}_j}l &= (\ten{R}\times_{k\in[r]}{{\mat{\Delta}}}_k^{-1})_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}{\mat{\alpha}}_k)_{(j)}} - \qquad{\color{gray}p_j\times q_j} && j = 1, ..., r. -\end{align*} -By equating the gradients to zero and expanding $\ten{R}$ we get a system of equations -\begin{gather*} - 0\overset{!}{=} \nabla_{\mat{\alpha}_j} l - = ((\ten{X}-\ten{F}\times_{l\in[r]}\mat{\alpha}_l)\times_{k\in[r]}{\mat{\Delta}}_k^{-1})_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}} \\ - % - (\ten{X}\times_{k\in[r]}{\mat{\Delta}}_k^{-1})_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}} = (\ten{F}\times_{k\in[r]}\mat{\Delta}_k^{-1}\mat{\alpha}_k)_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}. 
-\end{gather*} -Solving for $\mat{\alpha}_j$ in conjunction with the MLE estimates for the $\mat{\Delta}_j$'s gives a cross dependent system of equations for the $\mat{\alpha}_j$ MLE estimates -\begin{equation} - \widehat{\mat{\alpha}}_j = (\ten{X}\times_{k\in[r]\backslash j}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}}[(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\Delta}}_k^{-1}\widehat{\mat{\alpha}}_k)_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}}]^{-1} \label{eq:mle_est_alphas} -\end{equation} -for $j = 1, ..., r$. - -\begin{remark} - Note the similarity to the LS estimates but also that they are \emph{not} identical. Its a well known fact that the LS and MLE estimates under the Multivariate Normal model are identical. This seems to be violated, but this is \emph{not} the case because the equivalency only holds for the unstructured case. Both the LS and MLE solutions simplify in the unstructured case ($\ten{X}_i, \ten{F}_i$ are of order $r = 1$, e.g. vector valued) to the same well known solution, compare Example~\ref{ex:ls_vector_case} and Example~\ref{ex:mle_vector_case}. -\end{remark} - -\begin{example}[Vector Valued MLE ($r = 1$)]\label{ex:mle_vector_case} - Like in Example~\ref{ex:ls_vector_case} let the observations be vector valued. We get $\ten{F} = \ten{F}_{(1)} = \t{\mat{F}}$ and $\ten{X} = \ten{X}_{(1)} = \t{\mat{X}}$ which are both matrices of dimensions $n\times p$ and $n\times q$, respectively. The estimated residuals are $\ten{R} = \ten{R}_{(1)} = \t{\mat{R}} = \t{(\mat{X} - \mat{F}\t{\widehat{\mat{\alpha}}})}$ with $\mat{\alpha} = \mat{\alpha}_1$ as the single parameters matrix. 
In this case the tensor MLE estimate for $\mat{\alpha}$ simplifies to its well known form - \begin{align*} - \widehat{\mat{\alpha}} - &= (\ten{X}\times_{k\in\emptyset}{\widehat{\mat{\Delta}}}_k^{-1})_{(1)}\t{(\ten{F}\times_{k\in\emptyset}\widehat{\mat{\alpha}}_k)_{(j)}}[(\ten{F}\times_{k\in\emptyset}\widehat{\mat{\Delta}}_k^{-1}\widehat{\mat{\alpha}}_k)_{(j)}\t{(\ten{F}\times_{k\in\emptyset}\widehat{\mat{\alpha}}_k)_{(j)}}]^{-1} \\ - &= \ten{X}_{(1)}\t{\ten{F}_{(1)}}(\ten{F}_{(1)}\t{\ten{F}_{(1)}})^{-1} - = \t{\mat{X}}\mat{F}(\t{\mat{F}}\mat{F})^{-1} - \end{align*} - and the estimate for the covariance $\mat{\Delta} = \mat{\Delta}_1$ is simply - \begin{align*} - \widehat{\mat{\Delta}} - &= \frac{1}{n}(\ten{R}\times_{k\in\emptyset}{\widehat{\mat{\Delta}}}_k^{-1})_{(1)}\t{\ten{R}_{(1)}} = \frac{1}{n}\t{\mat{R}}\mat{R} - = \frac{1}{n}\t{(\mat{X} - \mat{F}\t{\widehat{\mat{\alpha}}_1})}(\mat{X} - \mat{F}\t{\widehat{\mat{\alpha}}_1}). - \end{align*} -\end{example} - -\paragraph{Comparison to the general case:} \todo{There are two main differences, first the general case has a closed form solution for the gradient due to the explicit nature of the MLE estimate of $\widehat{\mat\Delta}$ compared to the mutually dependent MLE estimates $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$. On the other hand the general case has dramatically bigger dimensions of the covariance matrix ($p q \times p q$) compared to the two Kronecker components with dimensions $q \times q$ and $p \times p$. 
This means that in the general case there is a huge performance penalty in the dimensions of $\widehat{\mat\Delta}$ while in the other case an extra estimation is required to determine $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$.} - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{Algorithms} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -A straightforward idea for parameter estimation is to use Gradient Descent. For pure algorithmic speedup, by only changing the update rule but \emph{not} the gradient computation of the objective function, we use Nesterov Accelerated Gradient Descent described in Section~\ref{sec:alg_gradient_descent}. An alternative approach applicable for all the methods is to resolve the cross dependence in the estimator equation systems by assuming all the other estimators to be fixed. This leads to an artificially created closed form solution for the current estimate which is computed according to the closed form solution. By cyclically iterating through all the parameters and repeating this process until convergence we get an alternative method as described in Section~\ref{sec:alg_iterative_updating}. In both cases initial estimates are needed for starting the iterative process which is the subject of Section~\ref{sec:alg_init}. - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Nesterov Accelerated Gradient Descent}\label{sec:alg_gradient_descent} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -In Section~\ref{sec:kron_cov} we derived for different objective functions, meaning parameterized functions as minimization target, the gradients.
In Section~\ref{sec:ls} the objective function is the Frobenius norm of the estimated residuals for solving the Least Squares problem, then in Section~\ref{sec:mle} it is the log-likelihood to find the MLE estimates and in Section~\ref{sec:approx} we had a pseudo log-likelihood. Regardless of which estimates we want to find, denote with $l$ the minimization objective corresponding to the desired minimization problem with parameters $\mat{\Theta}$ collecting all the parameters of the objective. The classic gradient descent algorithm starts with initial values $\mat{\Theta}^{(0)}$, see Section~\ref{sec:alg_init}, and applies gradient updates with a given learning rate $\delta > 0$ until convergence. The algorithm used is an extension of the classic Gradient Descent algorithm, namely Nesterov Accelerated Gradient Descent. This algorithm performs gradient updates similar to Gradient Descent, but before evaluation of the gradient an extrapolation of the current position into the previous step direction is performed. Furthermore, an internal line search loop is used to determine an appropriate step size. See Algorithm~\ref{alg:gd} for the case of joint parameter matrices $\widehat{\mat{\alpha}}_1, ..., \widehat{\mat{\alpha}}_r$ and covariances $\widehat{\mat{\Delta}}_1, ..., \widehat{\mat{\Delta}}_r$ estimation. In case that the parameter matrices and the covariances are \emph{not} estimated together, like in the LS estimation, the parameter vector $\mat{\Theta}$ consists only of the parameter matrices $\mat{\alpha}_j$ (which is the only difference) and at the end of the algorithm the estimated parameter matrices can be used for estimation of the covariances.
- -\todo{more details, better explanation, higher/lower abstraction with respect to the different methods?!} - -\begin{algorithm}[ht] - \caption{\label{alg:gd}Nesterov Accelerated Gradient Descent} - \begin{algorithmic}[1] - \State Arguments: Order $r + 1$ tensors $\ten{X}$, $\ten{F}$ - \State Initialize: $\mat{\Theta}^{(0)} = (\mat{\alpha}^{(0)}_1, ..., \mat{\alpha}^{(0)}_r, \mat{\Delta}_1^{(0)}, ..., \mat{\Delta}_r^{(0)})$, $0 < c, \delta^{(1)}$ and $0 < \gamma < 1$ \Comment{See Section~\ref{sec:alg_init}} - \\ - \State $t \leftarrow 1$ - \Comment{step counter} - \State $\mat{\Theta}^{(1)} \leftarrow \mat{\Theta}^{(0)}$ - \Comment{artificial first step} - \State $(m^{(0)}, m^{(1)}) \leftarrow (0, 1)$ - \Comment{momentum extrapolation weights} - \\ - \Repeat \Comment{repeat untill convergence} - \State $\mat{M} \leftarrow \mat{\Theta}^{(t)} + \frac{m^{(t - 1)} - 1}{m^{(t)}}(\mat{\Theta}^{(t)} - \mat{\Theta}^{(t - 1)})$ \Comment{momentum extrapolation} - \For{$\delta = \gamma^{-1}\delta^{(t)}, \delta^{(t)}, \gamma\delta^{(t)}, \gamma^2\delta^{(t)}, ...$} \Comment{Line Search} - \State $\mat{\Theta}_{\text{temp}} \leftarrow \mat{M} + \delta \nabla_{\mat{\Theta}} l(\mat{M})$ - \If{$l(\mat{\Theta}_{\text{temp}}) \leq l(\mat{\Theta}^{(t - 1)}) - c \delta \|\nabla_{\mat{\Theta}} l(\mat{M})\|_F^2$} \Comment{Armijo Condition} - \State $\mat{\Theta}^{(t + 1)} \leftarrow \mat{\Theta}_{\text{temp}}$ - \State $\delta^{(t + 1)} \leftarrow \delta$ - \Break - \EndIf - \EndFor - \State $m^{(t + 1)} \leftarrow \frac{1 + \sqrt{1 + (2 m^{(t)})^2}}{2}$ \Comment{update extrapolation weights} - \State $t \leftarrow t + 1$ - \Until converged - \end{algorithmic} -\end{algorithm} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Iterative Cyclic Updating}\label{sec:alg_iterative_updating} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -For both the LS and MLE estimates we have derived equation systems, 
namely \eqref{eq:ls_est_alphas}, \eqref{eq:mle_est_alphas} and \eqref{eq:mle_est_Deltas}, for the estimates which are cross dependent. The idea of Iterative Cyclic Updating is simply to take the cross dependent equations and assume the unknown quantities given from the previous iteration. Then the cross dependency reduces to a closed form solution depending on the known previous iterates. This iterative process is repeated until convergence, see Algorithm~\ref{alg:iterative_updating} for the case of LS. - -\begin{algorithm}[ht] - \caption{\label{alg:iterative_updating}Iterative Cyclic Updating for LS estimates} - \begin{algorithmic}[1] - \State Arguments: Order $r + 1$ tensors $\ten{X}$, $\ten{F}$ - \State Initialize: $\mat{\alpha}^{(0)}_1, ..., \mat{\alpha}^{(0)}_r$ \Comment{See Section~\ref{sec:alg_init}} - \\ - \State $t \leftarrow 0$ - \Repeat\Comment{until convergence} - \For{$j = 1$ to $r$}\Comment{For each mode} - \State $\widehat{\mat{\alpha}}^{(t+1)}_j \leftarrow \ten{X}_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}^{(t)}_k)_{(j)}}[(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}^{(t)}_k)_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}^{(t)}_k)_{(j)}}]^{-1}$\label{alg:iterative_updating:update} \\ - \Comment{Solve for $\widehat{\mat{\alpha}}_j$ given all others} - \EndFor - \Until{$\sum_{j = 1, ..., r} \| \widehat{\mat{\alpha}}^{(t)}_j - \widehat{\mat{\alpha}}^{(t - 1)}_j \|_F^2 < \epsilon^2$} - \Comment{convergence condition (one example)} - \end{algorithmic} -\end{algorithm} - -A refined version would be to always take the newest estimates. In the case of Algorithm~\ref{alg:iterative_updating} this means that when computing $\widehat{\mat{\alpha}}_j^{(t + 1)}$ we use $\widehat{\mat{\alpha}}_k^{(t + 1)}$ for $k = 1, ..., j - 1$ and $\widehat{\mat{\alpha}}_k^{(t)}$ for $k = j + 1, ..., r$ in line \ref{alg:iterative_updating:update} instead.
- -Furthermore, there is also the idea of randomizing the updating order which seems to improve convergence and helps stabilize the algorithm. - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Initial Values}\label{sec:alg_init} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -Currently there are two approaches for the initial value estimates required by Algorithm~\ref{alg:gd} and Algorithm~\ref{alg:iterative_updating}. First there is the \emph{Van Loan and Pitsianis} (VLP) method shortly described in Section~\ref{sec:VLP}. The second ansatz is to perform a \emph{Higher Order Principal Component Analysis} (HOPCA) and take the required number of eigenvectors as initial values described in Section~\ref{sec:HOPCA}. - -\subsubsection{Van Loan and Pitsianis (VLP)}\label{sec:VLP} -The VLP approach builds on a least squares solution of a rank 1 Kronecker product decomposition \cite{ApproxKron-VanLoanPitsianis1993}. To use the VLP decomposition we solve the vectorized version of model \ref{eq:sample_model} (assuming $\ten{\mu} = 0$) -\begin{displaymath} - \ten{X}_{(r+1)} = \ten{F}_{(r+1)} \bigotimes_{j = r}^1 \t{\mat{\alpha}}_j + \ten{\epsilon}_{(r+1)}. -\end{displaymath} -Let $\mat{B} = \bigotimes_{j = r}^1 \t{\mat{\alpha}}_j$, then an estimate of $\mat{B}$ without considering the Kronecker structure is given by the usual least squares solution -\begin{displaymath} - \widehat{\mat{B}} = \t{\ten{X}_{(r+1)}}\ten{F}_{(r+1)}(\t{\ten{F}_{(r+1)}}\ten{F}_{(r+1)})^{-1}. -\end{displaymath} -Then we approximate $\widehat{\mat{B}}$ using the VLP rank 1 Kronecker decomposition (iteratively if $r > 2$, e.g. when there are more than two $\widehat{\mat{\alpha}}_j$'s) as -\begin{displaymath} - \widehat{\mat{\alpha}}_1, ..., \widehat{\mat{\alpha}}_r = \argmin_{\mat{\alpha}_1, ..., \mat{\alpha}_r} \| \widehat{\mat{B}} - \bigotimes_{j = r}^1 \t{\mat{\alpha}}_j \|_F.
-\end{displaymath} - -\subsubsection{Higher Order Principal Component Analysis (HOPCA)}\label{sec:HOPCA} -The \emph{Higher Order Principal Component Analysis} is a simple estimation method for estimating Principal Components for each mode of a dataset consisting of tensor valued observations as illustrated in Algorithm~\ref{alg:HOPCA}. - -\begin{algorithm}[ht] - \caption{\label{alg:HOPCA}Higher Order Principal Component Analysis} - \begin{algorithmic}[1] - \State Arguments: Order $r + 1$ tensor $\ten{X}$ of dimensions $p = (p_1, ..., p_r, n)$, $q = (q_1, ..., q_r)$ number of components for each mode - \State Returns: $\mat{A}_j$ PC matrices for each mode of dimensions $p_j\times q_j$, $j = 1, ..., r$. - \\ - \For{$j = 1$ to $r$}\Comment{For each mode} - \State $\mat{A}_j \leftarrow$ the first $q_j$ eigenvectors of $\ten{X}_{(j)}\t{\ten{X}_{(j)}}$ - \EndFor - \end{algorithmic} -\end{algorithm} - - -\subsection{Estimation Algorithms} -By using initial values from Section~\ref{sec:alg_init} for the Gradient Descent algorithm from Section~\ref{sec:alg_gradient_descent} or the Iterative Updating algorithm from Section~\ref{sec:alg_iterative_updating} the complete sequence of algorithms for estimation is combined as illustrated in Figure~\ref{fig:algo_dependency}.
- -\begin{figure}[!hp] - \centering - \begin{tikzpicture}[>=latex] - \node[draw, text centered, text width = 3cm, gray] (vlp) at (-2, 0) {VLP \\ Section~\ref{sec:alg_init}}; - \node[draw, text centered, text width = 3cm] (hopca) at (2, 0) {HOPCA \\ Section~\ref{sec:alg_init}}; - \node[draw, text centered, text width = 3cm] (ls) at (0, -1.5) {LS \\ Section~\ref{sec:ls},~\ref{sec:alg_gradient_descent},~\ref{sec:alg_iterative_updating}}; - \node[draw, text centered, text width = 3cm] (mle) at (0, -3) {MLE \\ Section~\ref{sec:mle},~\ref{sec:alg_gradient_descent},~\ref{sec:alg_iterative_updating}}; - % \node[draw, text centered, text width = 3cm] (approx) at (2, -3) {pseudo MLE \\ Section~\ref{sec:mle},~\ref{sec:alg_gradient_descent},\todo{}}; - \draw[->, gray] (vlp) -- (ls); - \draw[->] (hopca) -- (ls); - \draw[->] (ls) -- (mle); - \draw[->, dotted] (vlp) |- (mle); - \draw[->, dotted] (hopca) |- (mle); - % \draw[->] (ls) -- (approx); - % \draw[->, dotted] (vlp) -- (mle); - % \draw[->, dotted] (hopca) -- (approx); - % \draw[dotted] ($(vlp.south) + (0, -0.25)$) -- ($(hopca.south) + (0, -0.25)$); - \end{tikzpicture} - \caption{\label{fig:algo_dependency}Algorithmic dependencies.} -\end{figure} - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{Simulations} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Metrics} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -For measring the performance of the different methods in the simulations we employ different metrics to compare the estimates against the ``true'' parameters used to generate the simulation data. The parameters in the sample model \ref{eq:sample_model} are the parameter matrices $\widehat{\mat{\alpha}}_j$ and the covariance matrices $\widehat{\mat{\Delta}}_j$. 
Given the ``true'' parameters $\mat{\alpha}_j, \mat{\Delta}_j$, $j = 1, ..., r$ used to generate the simulation data samples $(\ten{X}_i, \ten{F}_i)$ for $i = 1, ..., n$ we employ a few metrics. - -First there is the \emph{maximum subspace distance} as the maximum over the $j = 1, ..., r$ modes of the Frobenius norms of the subspace projection matrix differences; -\begin{displaymath} - d_{\mat{\alpha}}((\mat{\alpha}_1, ..., \mat{\alpha}_r), (\widehat{\mat{\alpha}}_1, ... ,\widehat{\mat{\alpha}}_r)) = \max_{j = 1, ..., r} \| \mat{P}_{\mat{\alpha}_j} - \mat{P}_{\widehat{\mat{\alpha}}_j} \|_F -\end{displaymath} -where $\mat{P}_{\mat{A}} = \mat{A}(\t{\mat{A}}\mat{A})^{-1}\t{\mat{A}}$ is the projection onto $\Span(\mat{A})$. - -Another interesting distance is between the Kronecker products of the parameter matrices $\mat{\beta} = \bigotimes_{j = r}^1 \mat{\alpha}_j$ and its estimate $\widehat{\mat{\beta}} = \bigotimes_{j = r}^1 \widehat{\mat{\alpha}}_j$. They correspond to the parameters of the vectorized model under the Kronecker product constraint. We use again the Frobenius norm of the projection differences -\begin{displaymath} - d_{\mat{\beta}} = \|\mat{P}_{\mat{\beta}} - \mat{P}_{\widehat{\mat{\beta}}}\|_F. -\end{displaymath} -This might get very expensive to compute directly. But fortunately this can be drastically simplified, in the sense of the size of the involved matrices.
Therefore, we use some properties of projections as well as the Kronecker product which allow us to rewrite -\begin{align*} - \|\mat{P}_{\mat{\beta}} - \mat{P}_{\widehat{\mat{\beta}}}\|_F^2 - &= \tr((\mat{P}_{\mat{\beta}} - \mat{P}_{\widehat{\mat{\beta}}})\t{(\mat{P}_{\mat{\beta}} - \mat{P}_{\widehat{\mat{\beta}}})}) \\ - &= \tr(\mat{P}_{\mat{\beta}}) - 2\tr(\mat{P}_{\mat{\beta}}\mat{P}_{\widehat{\mat{\beta}}}) + \tr(\mat{P}_{\widehat{\mat{\beta}}}) \\ - &= \prod_{j = 1}^r\tr(\mat{P}_{\mat{\alpha}_j}) - 2\prod_{j = 1}^r\tr(\mat{P}_{\mat{\alpha}_j}\mat{P}_{\widehat{\mat{\alpha}}_j}) + \prod_{j = 1}^r\tr(\mat{P}_{\widehat{\mat{\alpha}}_j}) \\ - &= \prod_{j = 1}^r\rank(\mat{\alpha}_j) - 2\prod_{j = 1}^r\tr(\mat{P}_{\mat{\alpha}_j}\mat{P}_{\widehat{\mat{\alpha}}_j}) + \prod_{j = 1}^r\rank(\widehat{\mat{\alpha}}_j). -\end{align*} -This formulation allows for a substantially more efficient implementation which can lead to a drastic speedup in the simulations because the computation of $d_{\mat{\beta}}$ in its raw form can take a significant amount of time compared to the estimation. In addition, the memory footprint is also reduced drastically. - -Finally, for validating the estimation quality of the covariances $\mat{\Delta}_j$ the Frobenius distance of the ``true'' covariance $\mat{\Delta} = \bigotimes_{j = r}^1 \mat{\Delta}_j$ to its estimate $\widehat{\mat{\Delta}} = \bigotimes_{j = r}^1 \widehat{\mat{\Delta}}_j$ is computed. -\begin{displaymath} - d_{\mat{\Delta}}(\mat{\Delta}, \widehat{\mat{\Delta}}) = \|\mat{\Delta} - \widehat{\mat{\Delta}}\|_F.
-\end{displaymath} -Again, this can be rewritten in a computationally easier form -\begin{align*} - \|\mat{\Delta} - \widehat{\mat{\Delta}}\|_F^2 - &= \tr\big((\mat{\Delta} - \widehat{\mat{\Delta}})\t{(\mat{\Delta} - \widehat{\mat{\Delta}})}\big) \\ - &= \tr\mat{\Delta}^2 - 2 \tr\mat{\Delta}\widehat{\mat{\Delta}} + \tr\widehat{\mat{\Delta}}^2 \\ - &= \prod_{j = 1}^r\tr\mat{\Delta}_j^2 - 2 \prod_{j = 1}^r\tr\mat{\Delta}_j\widehat{\mat{\Delta}}_j + \prod_{j = 1}^r\tr\widehat{\mat{\Delta}}_j^2. -\end{align*} -There are also cases where the ``true'' covariance $\mat{\Delta}$ is \emph{not} a Kronecker product, in this situation the distance needs to be computed directly as $\|\mat{\Delta} - \widehat{\mat{\Delta}}\|_F$. - -Furthermore, we report the negative log-likelihood without the constraint term as \emph{loss} and the \emph{mean squared error} MSE -\begin{displaymath} - \text{MSE} = \frac{1}{n p} \|\ten{R}\|_F^2. -\end{displaymath} - -% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% \subsection{Simulations} -% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -% In this section we present a few simulation studies. - -% % \subsubsection{Sim 1} -% % For $n = 200$ observations, with - -% % \begin{figure} -% % \centering -% % \includegraphics[width = 0.6\textwidth]{sim2.logs.png} -% % \end{figure} - -% % The first example (which by it self is \emph{not} exemplary) is the estimation with parameters $n = 200$, $p = 11$, $q = 5$, $k = 14$ and $r = 9$. The ``true'' matrices $\mat\alpha$, $\mat\beta$ generated by sampling there elements i.i.d. standard normal like the responses $y$. Then, for each observation, $\mat{f}_y$ is computed as $\sin(s_{i, j} y + o_{i j})$ \todo{ properly describe} to fill the elements of $\mat{f}_y$.
Then the $\mat{X}$'s are samples as -% % \begin{displaymath} -% % \mat{X} = \mat{\beta}\mat{f}_y \t{\mat{\alpha}} + \mat{\epsilon}, \qquad \vec{\mat{\epsilon}} \sim \mathbb{N}_{p q}(\mat{0}, \mat{\Delta}) -% % \end{displaymath} -% % where $\mat{\Delta}_{i j} = 0.5^{|i - j|}$ for $i, j = 1, ..., p q$. - -% \subsubsection{Sim 4 (multi-variable polynomial interactions)} -% Lets considure a polynomial model with $r$ variables. Each variable by itself is described by a polynomial of order $p_j - 1$. Let $z_j$ be the variables for the $j$'th axis and let $\mat{\alpha}_j\in\mathbb{R}^{p_j}$, $j = 1, ..., r$ be the parameters for the polynomial of the $j$'th variable (note that the parameters $\mat{\alpha}_j$ are vectors treates by the estimation methods as $p_j\times 1$ matrices). -% \begin{displaymath} -% y = \prod_{j = 1}^r \sum_{k = 1}^{p_j}\mat{\alpha}_{j, k} z_j^{k - 1} -% \end{displaymath} -% For the simulation we take a 3D problem ($r = 3$) and quadratic polynomials. For $n = 500$ samples the parameters $\mat{z} = (z_1, z_2, z_3)$ are samples uniformly from the $[-1, 1]^3$ cube as arguments to quadratic polynomials $p_1 = p_2 = p_3 = 3$. The $i$'th scalar response $y_i$ is then computed by above formula which can be written in a multi-linear setting as -% \begin{displaymath} -% y_i = \ten{X}_i\times_{j\in[r]}\t{\mat{\alpha}_j} + \mathcal{N}(0, \sigma^2) -% \end{displaymath} -% where the order $r = 3$ tensor $\ten{X}$ consists of all interaction terms of the monoms $x_{j}^{k - 1}$ for $k = 1, ..., p_j$ for each axis $j = 1, ..., r$ ans $\sigma = 0.1$. More explicitly, the vectorized version satisfies -% \begin{displaymath} -% \vec{\ten{X}} = \bigotimes_{j = r}^r \t{(1, z_j, z_j^2, ..., z_j^{p_j-1})}. -% \end{displaymath} - -% This corresponds to the classic polynomial interaction model of the form -% \begin{displaymath} -% y = \t{\vec(\ten{Z})}\mat{\beta} + \mathcal{N}(0, \sigma^2). -% \end{displaymath} -% where $\mat{\beta} = \bigotimes_{j = r}^1\mat{\alpha}_j$. 
- -\subsection{EEG Data} -As a real-world example we compare the HO-PIR method against two reference methods, namely LSIR \cite{lsir-PfeifferForzaniBura} and K-PIR \cite{sdr-PfeifferKaplaBura2021}, on the EEG data set. This is a small study including $77$ alcoholic individuals and $45$ control subjects (see: \url{http://kdd.ics.uci.edu/databases/eeg/eeg.data.html}). The data for each subject consisted of a $64\times 256$ matrix, with each column representing a time point and each row a channel. The data were obtained by exposing each individual to visual stimuli and measuring voltage values from $64$ electrodes placed on the subjects' scalps, sampled at $256$ time points (at $256$ Hz for $1$ second). Each individual observation is the mean over $120$ different trials per subject. We first preprocess the data using the HO-PCA Algorithm~\ref{alg:HOPCA} with different PCAs per mode. The results are listed in Table~\ref{tab:eeg_sim}. - - -\begin{table}[!ht] - \centering - \begin{tabular}{*{2}{l} | *{7}{c}} - method & npc & Acc & AUC (sd) \\ - \hline - K-PIR (mle) & (3, 4) & 0.70 & 0.75 (0.05) \\ - LSIR & (3, 4) & 0.80 & 0.85 (0.04) \\ - HO-PIR (ls) & (3, 4) & 0.80 & 0.85 (0.04) \\ % Algorithm: ICU - % HO-PIR (ls,nagd) & (3, 4) & 0.80 & 0.85 (0.04) \\ - HO-PIR (mle) & (3, 4) & 0.80 & 0.85 (0.04) \\ % Algorithm: ICU - % HO-PIR (mle,nagd)& (3, 4) & 0.80 & 0.85 (0.04) \\ - \hline - K-PIR (mle) & (15, 15) & --- & 0.78 (0.04) \\ - LSIR & (15, 15) & 0.72 & 0.81 (0.04) \\ - HO-PIR (ls) & (15, 15) & 0.79 & 0.83 (0.04) \\ % Algorithm: ICU - % HO-PIR (ls,nagd) & (15, 15) & 0.79 & 0.83 (0.04) \\ - HO-PIR (mle) & (15, 15) & 0.77 & 0.83 (0.04) \\ % Algorithm: ICU - % HO-PIR (mle,nagd)& (15, 15) & 0.76 & 0.81 (0.04) \\ - \hline - K-PIR (mle) & (20, 30) & --- & 0.78 (0.04) \\ - LSIR & (20, 30) & 0.79 & 0.83 (0.04) \\ - HO-PIR (ls) & (20, 30) & 0.75 & 0.80 (0.05) \\ % Algorithm: ICU - % HO-PIR (ls,nagd) & (20, 30) & 0.75 & 0.80 (0.05) \\ - HO-PIR (mle) & (20, 30) & 0.75 
& 0.83 (0.04) % \\ % Algorithm: ICU - % HO-PIR (mle,nagd)& (20, 30) & 0.75 & 0.80 (0.05)% \\ - % %HOPIR(ls,icu) & (256, 64) & 0.67 & 0.69 (0.05) \\ - \end{tabular} - \caption{\label{tab:eeg_sim}Recreation of the EEG data LOO-CV from \cite{sdr-PfeifferKaplaBura2021} Section~7. The methods HO-PIR (ls) and HO-PIR (mle) use the ICU Algorithm~\ref{alg:iterative_updating}.} -\end{table} - -% % Same as above, but with more content -% \begin{table}[!ht] -% \centering -% \begin{tabular}{*{2}{l} | *{7}{c}} -% method & npc & Acc & Err & FPR & TPR & FNR & TNR & AUC (sd) \\ -% \hline -% K-PIR (mle) & (3, 4) & 0.70 & 0.30 & 0.60 & 0.87 & 0.13 & 0.40 & 0.75 (0.05) \\ -% LSIR & (3, 4) & 0.80 & 0.20 & 0.36 & 0.88 & 0.12 & 0.64 & 0.85 (0.04) \\ -% HO-PIR (ls,icu) & (3, 4) & 0.80 & 0.20 & 0.33 & 0.87 & 0.13 & 0.67 & 0.85 (0.04) \\ -% HO-PIR (ls,nagd) & (3, 4) & 0.80 & 0.20 & 0.33 & 0.87 & 0.13 & 0.67 & 0.85 (0.04) \\ -% HO-PIR (mle,icu) & (3, 4) & 0.80 & 0.20 & 0.36 & 0.88 & 0.12 & 0.64 & 0.85 (0.04) \\ -% HO-PIR (mle,nagd)& (3, 4) & 0.80 & 0.20 & 0.33 & 0.87 & 0.13 & 0.67 & 0.85 (0.04) \\ -% \hline -% K-PIR (mle) & (15, 15) & --- & --- & --- & --- & --- & --- & 0.78 (0.04) \\ -% LSIR & (15, 15) & 0.72 & 0.28 & 0.44 & 0.82 & 0.18 & 0.56 & 0.81 (0.04) \\ -% HO-PIR (ls,icu) & (15, 15) & 0.79 & 0.21 & 0.38 & 0.88 & 0.12 & 0.62 & 0.83 (0.04) \\ -% HO-PIR (ls,nagd) & (15, 15) & 0.79 & 0.21 & 0.38 & 0.88 & 0.12 & 0.62 & 0.83 (0.04) \\ -% HO-PIR (mle,icu) & (15, 15) & 0.77 & 0.23 & 0.40 & 0.87 & 0.13 & 0.60 & 0.83 (0.04) \\ -% HO-PIR (mle,nagd)& (15, 15) & 0.76 & 0.24 & 0.47 & 0.90 & 0.10 & 0.53 & 0.81 (0.04) \\ -% \hline -% K-PIR (mle) & (20, 30) & --- & --- & --- & --- & --- & --- & 0.78 (0.04) \\ -% LSIR & (20, 30) & 0.79 & 0.21 & 0.36 & 0.87 & 0.13 & 0.64 & 0.83 (0.04) \\ -% HO-PIR (ls,icu) & (20, 30) & 0.75 & 0.25 & 0.38 & 0.83 & 0.17 & 0.62 & 0.80 (0.05) \\ -% HO-PIR (ls,nagd) & (20, 30) & 0.75 & 0.25 & 0.38 & 0.83 & 0.17 & 0.62 & 0.80 (0.05) \\ -% HO-PIR (mle,icu) & (20, 30) & 0.75 & 
0.25 & 0.40 & 0.83 & 0.17 & 0.60 & 0.83 (0.04) \\ -% HO-PIR (mle,nagd)& (20, 30) & 0.75 & 0.25 & 0.42 & 0.86 & 0.14 & 0.58 & 0.80 (0.05)% \\ -% %HOPIR(ls,icu) & (256, 64) & 0.67 & 0.33 & 0.53 & 0.79 & 0.21 & 0.47 & 0.69 (0.05) \\ -% \end{tabular} -% \caption{\label{tab:eeg_sim}Recreation of the EEG data LOO-CV from \cite{sdr-PfeifferKaplaBura2021} Section~7.} -% \end{table} - - -% \begin{table}[!ht] -% \centering -% % see: https://en.wikibooks.org/wiki/LaTeX/Tables -% \begin{tabular}{ll|r@{ }l *{3}{r@{.}l}} -% method & init -% & \multicolumn{2}{c}{loss} -% & \multicolumn{2}{c}{MSE} -% & \multicolumn{2}{c}{$\dist(\hat{\mat\alpha}, \mat\alpha)$} -% & \multicolumn{2}{c}{$\dist(\hat{\mat\beta}, \mat\beta)$} -% \\ \hline -% K-PIR (mle) & & -2642&(1594) & 1&82 (2.714) & 0&248 (0.447) & 0&271 (0.458) \\ % base -% new & vlp & -2704&(1452) & 1&78 (2.658) & 0&233 (0.438) & 0&260 (0.448) \\ -% new & ls & -3479& (95) & 0&99 (0.025) & 0&037 (0.017) & 0&035 (0.015) \\ -% momentum & vlp & -2704&(1452) & 1&78 (2.658) & 0&233 (0.438) & 0&260 (0.448) \\ -% momentum & ls & -3479& (95) & 0&99 (0.025) & 0&037 (0.017) & 0&035 (0.015) \\ -% approx & vlp & 6819&(1995) & 3&99 (12.256) & 0&267 (0.448) & 0&287 (0.457) \\ -% approx & ls & 5457& (163) & 0&99 (0.025) & 0&033 (0.017) & 0&030 (0.012) \\ -% \end{tabular} -% \caption{Mean (standard deviation) for simulated runs of $20$ repititions for the model $\mat{X} = \mat{\beta}\mat{f}_y\t{\mat{\alpha}}$ of dimensions $(p_1, p_2) = (11, 7)$, $(q_1, q_2) = (3, 5)$ with a sample size of $n = 200$. The covariance structure is $\mat{\Delta} = \mat{\Delta}_2\otimes \mat{\Delta}_1$ for $\Delta_i = \text{AR}(\sqrt{0.5})$, $i = 1, 2$. 
The functions applied to the standard normal response $y$ are $\sin, \cos$ with increasing frequency.} -% \end{table} - -% \begin{figure} -% \centering -% \includegraphics{loss_Ex01.png} -% \end{figure} -% \begin{figure} -% \centering -% \includegraphics{estimates_Ex01.png} -% \end{figure} -% \begin{figure} -% \centering -% \includegraphics{Delta_Ex01.png} -% \end{figure} -% \begin{figure} -% \centering -% \includegraphics{hist_Ex01.png} -% \end{figure} - -% > print(times, digits = 2) -% method npc elapsed sys.self user.self -% 1 hopir.ls.icu (3, 4) 0.003 0.006 0.005 -% 3 hopir.ls.nagd (3, 4) 0.023 0.000 0.023 -% 2 hopir.mle.icu (3, 4) 0.020 0.044 0.038 -% 4 hopir.mle.nagd (3, 4) 0.062 0.125 0.109 - -% 5 hopir.ls.icu (15, 15) 0.012 0.061 0.037 -% 7 hopir.ls.nagd (15, 15) 0.089 0.009 0.082 -% 6 hopir.mle.icu (15, 15) 0.176 0.860 0.503 -% 8 hopir.mle.nagd (15, 15) 0.212 0.657 0.478 -% 9 hopir.ls.icu (20, 30) 0.028 0.142 0.079 -% 11 hopir.ls.nagd (20, 30) 0.302 0.269 0.347 -% 10 hopir.mle.icu (20, 30) 0.429 2.043 1.350 -% 12 hopir.mle.nagd (20, 30) 0.461 0.958 0.828 - -% 13 hopir.ls.icu (256, 64) 0.946 3.574 2.840 - -% \begin{table}[!ht] -% \centering -% \begin{tabular}{*{3}{l} | *{7}{c}} -% method & init & npc & Acc & Err & FPR & TPR & FNR & TNR & AUC (sd) \\ \hline -% K-PIR (mle) & & (3, 4) & 0.70 & 0.30 & 0.60 & 0.87 & 0.13 & 0.40 & 0.75 (0.047) \\ % base -% new & vlp & (3, 4) & 0.70 & 0.30 & 0.60 & 0.87 & 0.13 & 0.40 & 0.75 (0.047) \\ -% new & ls & (3, 4) & 0.74 & 0.26 & 0.51 & 0.88 & 0.12 & 0.49 & 0.77 (0.045) \\ -% ls & & (3, 4) & 0.75 & 0.25 & 0.49 & 0.88 & 0.12 & 0.51 & 0.78 (0.044) \\ -% ls$^*$ & & (3, 4) & 0.78 & 0.22 & 0.38 & 0.87 & 0.13 & 0.62 & 0.86 (0.034) \\ % -% LSIR & & (3, 4) & 0.80 & 0.20 & 0.36 & 0.88 & 0.12 & 0.64 & 0.85 (0.036) \\ -% momentum & vlp & (3, 4) & 0.70 & 0.30 & 0.60 & 0.87 & 0.13 & 0.40 & 0.75 (0.047) \\ -% momentum & ls & (3, 4) & 0.70 & 0.30 & 0.58 & 0.87 & 0.13 & 0.42 & 0.76 (0.046) \\ -% approx & vlp & (3, 4) & 0.68 & 0.32 & 0.62 
& 0.86 & 0.14 & 0.38 & 0.74 (0.048) \\ -% approx & ls & (3, 4) & 0.73 & 0.27 & 0.53 & 0.88 & 0.12 & 0.47 & 0.78 (0.044) \\ \hline -% ls & & (15, 15) & 0.75 & 0.25 & 0.47 & 0.87 & 0.13 & 0.53 & 0.78 (0.044) \\ -% ls$^*$ & & (15, 15) & 0.76 & 0.24 & 0.44 & 0.88 & 0.12 & 0.56 & 0.83 (0.039) \\ % -% LSIR & & (15, 15) & 0.72 & 0.28 & 0.44 & 0.82 & 0.18 & 0.56 & 0.81 (0.040) \\ -% approx & ls & (15, 15) & 0.73 & 0.27 & 0.51 & 0.87 & 0.13 & 0.49 & 0.78 (0.044) \\ \hline -% ls & & (20, 30) & 0.75 & 0.25 & 0.47 & 0.87 & 0.13 & 0.53 & 0.78 (0.044) \\ -% ls$^*$ & & (20, 30) & 0.77 & 0.23 & 0.36 & 0.84 & 0.16 & 0.64 & 0.79 (0.045) \\ % -% LSIR & & (20, 30) & 0.79 & 0.21 & 0.36 & 0.87 & 0.13 & 0.64 & 0.83 (0.038) \\ -% approx & ls & (20, 30) & 0.63 & 0.37 & 1.00 & 1.00 & 0.00 & 0.00 & 0.51 (0.053) \\ \hline -% HOPCA & & & 0.62 & 0.38 & 1 & 0.99 & 0.01 & 0 & 0.56 (0.053) \\ -% ls & & & 0.75 & 0.25 & 0.44 & 0.87 & 0.13 & 0.56 & 0.78 (0.044) \\ -% ls$^*$ & & & 0.68 & 0.32 & 0.51 & 0.79 & 0.21 & 0.49 & 0.66 (0.054) \\ % -% approx & ls & & 0.75 & 0.25 & 0.44 & 0.87 & 0.13 & 0.56 & 0.78 (0.044) \\ -% \end{tabular} -% \caption{\label{tab:eeg_sim}Recreation of the EEG data LOO-CV from \cite{sdr-PfeifferKaplaBura2021} Section~7, EEG Data and Table~6 with new methods. Column \emph{vlp} stands for the Van Loan and Pitsianis initialization scheme as described in \cite{sdr-PfeifferKaplaBura2021} and \emph{ls} is the approach described above. The column \emph{npc} gives the number of principal component of the $(2D)^2 PCA$ preprocessing. 
Reduction by $\ten{X}\times_{j\in[r]}\t{\widehat{\mat{\alpha}}_j}$ instread of $^*$ where reduction is $\ten{X}\times_{j\in[r]}\t{\widehat{\mat{\Delta}}_j^{-1}\widehat{\mat{\alpha}}_j}$.} -% \end{table} - -% \begin{table} -% \centering -% \begin{tabular}{*{3}{l} | *{3}{r@{.}l}} -% method & init & npc -% & \multicolumn{2}{c}{elapsed} -% & \multicolumn{2}{c}{sys.self} -% & \multicolumn{2}{c}{user.self} \\ \hline -% base & & (3, 4) & 0&079 & 0&402 & 0&220 \\ -% new & vlp & (3, 4) & 0&075 & 0&393 & 0&217 \\ -% new & ls & (3, 4) & 0&218 & 0&243 & 0&305 \\ -% ls & & (3, 4) & 0&003 & 0&006 & 0&006 \\ -% LSIR & & (3, 4) & 0&002 & 0&000 & 0&002 \\ -% momentum & vlp & (3, 4) & 0&143 & 0&595 & 0&359 \\ -% momentum & ls & (3, 4) & 0&297 & 0&252 & 0&385 \\ -% approx & vlp & (3, 4) & 0&044 & 0&240 & 0&152 \\ -% approx & ls & (3, 4) & 0&066 & 0&144 & 0&121 \\ \hline -% ls & & (15, 15) & 0&012 & 0&059 & 0&034 \\ -% LSIR & & (15, 15) & 0&010 & 0&050 & 0&031 \\ -% approx & ls & (15, 15) & 0&813 & 3&911 & 2&325 \\ \hline -% ls & & (20, 30) & 0&028 & 0&129 & 0&080 \\ -% LSIR & & (20, 30) & 0&030 & 0&142 & 0&100 \\ -% approx & ls & (20, 30) & 2&110 & 10&111 & 6&290 \\ \hline -% HOPCA & & & 0&24 & 0&37 & 0&43 \\ -% ls & & & 1&252 & 6&215 & 3&681 \\ -% approx & ls & & 36&754 & 141&028 & 147&490 \\ -% \end{tabular} -% \caption{\label{tab:eeg_time}Like Table~\ref{tab:eeg_sim} but reports the mean run-time.} -% \end{table} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%% Bib and Index %%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% \printindex -\nocite{*} -\printbibliography[heading=bibintoc,title={References}] - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%% Appendix %%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\newpage - -\appendix -\section{Matrix Differential Rules} -Let $\mat A$ be a square matrix (and invertible if needed) 
and $|.|$ stands for the determinant -\begin{align*} - \d\log\mat A &= \frac{1}{|\mat A|}\d\mat{A} \\ - \d|\mat A| &= |\mat A|\tr \mat{A}^{-1}\d\mat A \\ - \d\log|\mat A| &= \tr\mat{A}^{-1}\d\mat A \\ - \d\mat{X}^{-1} &= -\mat{X}^{-1}(\d\mat{X})\mat{X}^{-1} -\end{align*} - -\section{Useful Matrix Identities} -In this section we summarize a few useful matrix identities, for more details see for example \cite{MatrixAlgebra-AbadirMagnus2005}. - -For two matrices $\mat A$ of dimensions $a_1\times a_2$ and $\mat B$ of dimensions $b_1\times b_2$ holds -\begin{equation} - \mat{K}_{b_1, a_1}(\mat{A}\otimes\mat{B})\mat{K}_{a_2, b_2} = \mat{B}\otimes\mat{A} -\end{equation} -as well as -\begin{equation}\label{eq:vecKron} - \vec(\mat A\kron\mat B) = (\mat{I}_{a_2}\kron\mat{K}_{b_2,a_1}\kron\mat{I}_{b_1})(\vec\mat A\kron\vec\mat B). -\end{equation} - -Let $\mat A$ be a $p\times p$ dimensional non-singular matrix. Furthermore, let $\mat a, \mat b$ be $p$ vectors such that $\t{\mat b}A^{-1}\mat a\neq -1$, then -\begin{displaymath} - (\mat A + \mat a\t{\mat b})^{-1} = \mat{A}^{-1} - \frac{1}{1 + \t{\mat b}A^{-1}\mat a}\mat{A}^{-1}\mat{a}\t{\mat{b}}\mat{A}^{-1} -\end{displaymath} -as well as -\begin{displaymath} - \det(\mat A + \mat a\t{\mat b}) = \det(\mat A)(1 + \t{\mat b}{\mat A}^{-1}\mat a) -\end{displaymath} -which even holds in the case $\t{\mat b}\mat{A}^{-1}\mat a = -1$. This is known as Sylvester's determinant theorem. - - -\section{Commutation Matrix and Permutation Identities} -\begin{center} - Note: In this section we use 0-indexing for the sake of simplicity! -\end{center} -In this section we summarize relations between the commutation matrix and corresponding permutation. We also list some extensions to ``simplify'' or represent some term. This is mostly intended for implementation purposes and understanding of terms occurring in the computations. - -Let $\mat A$ be an arbitrary $a_1\times a_2$ matrix. 
The permutation matrix $\mat K_{a_1, a_2}$ satisfies -\begin{displaymath} - \mat{K}_{a_1, a_2}\vec{\mat{A}} = \vec{\t{\mat{A}}} \quad\Leftrightarrow\quad (\vec{\mat{A}})_{\pi_{p, q}(i)} = (\vec{\t{\mat{A}}})_{i}, \quad\text{for } i = 0, ..., p q - 1 -\end{displaymath} -where $\pi_{p, q}$ is a permutation of the indices $i = 0, ..., p q - 1$ such that -\begin{displaymath} - \pi_{p, q}(i + j p) = j + i q, \quad\text{for }i = 0, ..., p - 1; j = 0, ..., q - 1. -\end{displaymath} - -\begin{table}[!htp] - \centering - \begin{minipage}{0.8\textwidth} - \centering - \begin{tabular}{l c l} - $\mat{K}_{p, q}$ & $\hat{=}$ & $\pi_{p, q}(i + j p) = j + i q$ \\ - $\mat{I}_r\kron\mat{K}_{p, q}$ & $\hat{=}$ & $\tilde{\pi}_{p, q, r}(i + j p + k p q) = j + i q + k p q$ \\ - $\mat{K}_{p, q}\kron\mat{I}_r$ & $\hat{=}$ & $\hat{\pi}_{p, q, r}(i + j p + k p q) = r(j + i q) + k$ - \end{tabular} - \caption{\label{tab:commutation-permutation}Commutation matrix terms and corresponding permutations. Indices are all 0-indexed with the ranges; $i = 0, ..., p - 1$, $j = 0, ..., q - 1$ and $k = 0, ..., r - 1$.} - \end{minipage} -\end{table} - - - -\section{Matrix and Tensor Operations} - -The \emph{Kronecker product}\index{Operations!Kronecker@$\kron$ Kronecker product} is denoted as $\kron$ and the \emph{Hadamard product} uses the symbol $\circ$. We also need the \emph{Khatri-Rao product}\index{Operations!KhatriRao@$\hada$ Khatri-Rao product} -$\hada$ as well as the \emph{Transposed Khatri-Rao product} $\odot_t$ (or \emph{Face-Splitting product}). There is also the \emph{$n$-mode Tensor Matrix Product}\index{Operations!ttm@$\ttm[n]$ $n$-mode tensor product} denoted by $\ttm[n]$ in conjunction with the \emph{$n$-mode Matricization} of a Tensor $\mat{T}$ written as $\mat{T}_{(n)}$, which is a matrix. 
See below for definitions and examples of these operations.\todo{ Definitions and Examples} - -\todo{ resolve confusion between Khatri-Rao, Column-wise Kronecker / Khatri-Rao, Row-wise Kronecker / Khatri-Rao, Face-Splitting Product, .... Yes, it's a mess.} -\paragraph{Kronecker Product $\kron$:} -\paragraph{Khatri-Rao Product $\hada$:} -\paragraph{Transposed Khatri-Rao Product $\odot_t$:} This is also known as the Face-Splitting Product and is the row-wise Kronecker product of two matrices. It relates to the Column-wise Kronecker Product through -\begin{displaymath} - \t{(\mat{A}\odot_{t}\mat{B})} = \t{\mat{A}}\hada\t{\mat{B}} -\end{displaymath} - -\paragraph{$n$-mode unfolding:} \emph{Unfolding}, also known as \emph{flattening} or \emph{matricization}, is a reshaping of a tensor into a matrix with rearrangement of the elements such that mode $n$ corresponds to columns of the result matrix and all other modes are vectorized in the rows. Let $\ten{T}$ be a tensor of order $m$ with dimensions $t_1\times ... \times t_n\times ... \times t_m$ and elements indexed by $(i_1, ..., i_n, ..., i_m)$. The $n$-mode flattening, denoted $\ten{T}_{(n)}$, is defined as a $(t_n, \prod_{k\neq n}t_k)$ matrix with element indices $(i_n, j)$ such that $j = \sum_{k = 1, k\neq n}^m i_k\prod_{l = 1, l\neq n}^{k - 1}t_l$. -\todo{ give an example!} - -\paragraph{$n$-mode Tensor Product $\ttm[n]$:} -The \emph{$n$-mode tensor product} $\ttm[n]$ between a tensor $\mat{T}$ of order $m$ with dimensions $t_1\times t_2\times ... \times t_n\times ... \times t_m$ and a $p\times t_n$ matrix $\mat{M}$ is defined element-wise as -\begin{displaymath} - (\ten{T}\ttm[n] \mat{M})_{i_1, ..., i_{n-1}, j, i_{n+1}, ..., i_m} = \sum_{k = 1}^{t_n} \ten{T}_{i_1, ..., i_{n-1}, k, i_{n+1}, ..., i_m} \mat{M}_{j, k} -\end{displaymath} -where $i_1, ..., i_{n-1}, i_{n+1}, ..., i_m$ run from $1$ to $t_1, ..., t_{n-1}, t_{n+1}, ..., t_m$, respectively. 
Furthermore, the $n$-th fiber index $j$ of the product ranges from $1$ to $p$. This gives a new tensor $\mat{T}\ttm[n]\mat{M}$ of order $m$ with dimensions $t_1\times t_2\times ... \times p\times ... \times t_m$. - -\begin{example}[Matrix Multiplication Analogs] - Let $\mat{A}$, $\mat{B}$ be two matrices with dimensions $t_1\times t_2$ and $p\times q$, respectively. Then $\mat{A}$ is also a tensor of order $2$, now the $1$-mode and $2$-mode products are element wise given by - \begin{align*} - (\mat{A}\ttm[1] \mat{B})_{i,j} &= \sum_{l = 1}^{t_1} \mat{A}_{l,j}\mat{B}_{i,l} - = (\mat{B}\mat{A})_{i,j} - & \text{for }t_1 = q, \\ - (\mat{A}\ttm[2] \mat{B})_{i,j} &= \sum_{l = 1}^{t_2} \mat{A}_{i,l}\mat{B}_{j,l} - = (\mat{A}\t{\mat{B}})_{i,j} = \t{(\mat{B}\t{\mat{A}})}_{i,j} - & \text{for }t_2 = q. - \end{align*} - In other words, the $1$-mode product equals $\mat{A}\ttm[1] \mat{B} = \mat{B}\mat{A}$ and the $2$-mode is $\mat{A}\ttm[2] \mat{B} = \t{(\mat{B}\t{\mat{A}})}$ in the case of the tensor $\mat{A}$ being a matrix. -\end{example} - -\begin{example}[Order Three Analogs] - Let $\mat{A}$ be a tensor of the form $t_1\times t_2\times t_3$ and $\mat{B}$ a matrix of dimensions $p\times q$, then the $n$-mode products have the following look - \begin{align*} - (\mat{A}\ttm[1]\mat{B})_{i,j,k} &= \sum_{l = 1}^{t_1} \mat{A}_{l,j,k}\mat{B}_{i,l} & \text{for }t_1 = q, \\ - (\mat{A}\ttm[2]\mat{B})_{i,j,k} &= \sum_{l = 1}^{t_2} \mat{A}_{i,l,k}\mat{B}_{j,l} \equiv (\mat{B}\mat{A}_{i,:,:})_{j,k} & \text{for }t_2 = q, \\ - (\mat{A}\ttm[3]\mat{B})_{i,j,k} &= \sum_{l = 1}^{t_3} \mat{A}_{i,j,l}\mat{B}_{k,l} \equiv \t{(\mat{B}\t{\mat{A}_{i,:,:}})}_{j,k} & \text{for }t_3 = q. 
- \end{align*} -\end{example} - -Letting $\ten{F}$ be the $3$-tensor of dimensions $n\times k\times r$ such that $\ten{F}_{i,:,:} = \mat{f}_{y_i}$, then -\begin{displaymath} - \mat{\beta}\mat{f}_{y_i}\t{\mat{\alpha}} = (\ten{F}\ttm[2]\mat{\beta}\ttm[3]\mat{\alpha})_{i,:,:} -\end{displaymath} -or in other words, the $i$-th slice of the tensor product $\ten{F}\ttm[2]\mat{\beta}\ttm[3]\mat{\alpha}$ contains $\mat{\beta}\mat{f}_{y_i}\t{\mat{\alpha}}$ for $i = 1, ..., n$. -Another analog way of writing this is -\begin{displaymath} - (\ten{F}\ttm[2]\mat{\beta}\ttm[3]\mat{\alpha})_{(1)} = \mathbb{F}_{y}(\t{\mat{\alpha}}\kron\t{\mat{\beta}}) -\end{displaymath} - -\section{Equivalencies} -In this section we give a short summary of alternative but equivalent operations. -Using the notation $\widehat{=}$ to indicate that two expressions are identical in the sense that they contain the same element in the same order but may have different dimensions. Meaning, when vectorizing ether side of $\widehat{=}$, they are equal ($\mat{A}\widehat{=}\mat{B}\ :\Leftrightarrow\ \vec{\mat{A}} = \vec{\mat{B}}$). - -Therefore, we use $\mat{A}, \mat{B}, \mat{X}, \mat{F}, \mat{R}, ...$ for matrices. 3-Tensors are written as $\ten{A}, \ten{B}, \ten{T}, \ten{X}, \ten{F}, \ten{R}, ...$. - -\begin{align*} - \ten{T}\ttm[3]\mat{A}\ &{\widehat{=}}\ \mat{T}\t{\mat A} & \ten{T}(n, p, q)\ \widehat{=}\ \mat{T}(n p, q), \mat{A}(p, q) \\ - \ten{T}\ttm[2]\mat{B}\ &{\widehat{=}}\ \mat{B}\ten{T}_{(2)} & \ten{T}(n, p, q), \ten{T}_{(2)}(p, n q), \mat{B}(q, p) -\end{align*} - -% \section{Matrix Valued Normal Distribution} -% A random variable $\mat{X}$ of dimensions $p\times q$ is \emph{Matrix-Valued Normal Distribution}, denoted -% \begin{displaymath} -% \mat{X}\sim\mathcal{MN}_{p\times q}(\mat{\mu}, \mat{\Delta}_2, \mat{\Delta}_1), -% \end{displaymath} -% if and only if $\vec\mat{X}\sim\mathcal{N}_{p q}(\vec\mat\mu, \mat\Delta_1\otimes\mat\Delta_2)$. 
Note the order of the covariance matrices $\mat\Delta_1, \mat\Delta_2$. Its density is given by -% \begin{displaymath} -% f(\mat{X}) = \frac{1}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{1}{2}\tr(\mat\Delta_1^{-1}\t{(\mat X - \mat \mu)}\mat\Delta_2^{-1}(\mat X - \mat \mu))\right). -% \end{displaymath} - -% \section{Sampling form a Multi-Array Normal Distribution} -% Let $\ten{X}$ be an order (rank) $r$ Multi-Array random variable of dimensions $p_1\times...\times p_r$ following a Multi-Array (or Tensor) Normal distributed -% \begin{displaymath} -% \ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r). -% \end{displaymath} -% Its density is given by -% \begin{displaymath} -% f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{q_i}} \Big)^{-1} -% \exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right) -% \end{displaymath} -% with $q_i = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution -% \begin{displaymath} -% \vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1) -% \end{displaymath} -% with $p = \prod_{i = 1}^r p_i$. - -% \todo{Check this!!!} - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Alternative covariance estimates}\label{sec:approx} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -An alternative approach is \emph{not} to use the MLE estimates for $\mat\Delta_1$, $\mat\Delta_2$ but (up to scaling) unbiased estimates. -\begin{displaymath} - \widetilde{\mat\Delta}_1 = \frac{1}{n}\sum_{i = 1}^n \t{\mat{R}_i}\mat{R}_i {\color{gray}\quad(q\times q)},\qquad - \widetilde{\mat\Delta}_2 = \frac{1}{n}\sum_{i = 1}^n \mat{R}_i\t{\mat{R}_i} {\color{gray}\quad(p\times p)}. 
-\end{displaymath} -The unbiasednes comes directly from the following short computation; -\begin{displaymath} - (\E\widetilde{\mat\Delta}_1)_{j,k} = \frac{1}{n}\sum_{i = 1}^n \sum_{l = 1}^p \E \mat{R}_{i,l,j}\mat{R}_{i,l,k} - = \frac{1}{n}\sum_{i = 1}^n \sum_{l = 1}^p (\mat{\Delta}_{2})_{l,l}(\mat{\Delta}_{1})_{j,k} - = (\mat\Delta_1\tr(\mat\Delta_2))_{j,k}. -\end{displaymath} -which means that $\E\widetilde{\mat\Delta}_1 = \mat\Delta_1\tr(\mat\Delta_2)$ and in analogy $\E\widetilde{\mat\Delta}_2 = \mat\Delta_2\tr(\mat\Delta_1)$. Now, we need to handle the scaling which can be estimated unbiasedly by -\begin{displaymath} - \tilde{s} = \frac{1}{n}\sum_{i = 1}^n \|\mat{R}_i\|_F^2 -\end{displaymath} -because with $\|\mat{R}_i\|_F^2 = \tr \mat{R}_i\t{\mat{R}_i} = \tr \t{\mat{R}_i}\mat{R}_i$ the scale estimate $\tilde{s} = \tr(\widetilde{\mat\Delta}_1) = \tr(\widetilde{\mat\Delta}_2)$. Then $\E\tilde{s} = \tr(\E\widetilde{\mat\Delta}_1) = \tr{\mat\Delta}_1 \tr{\mat\Delta}_2 = \tr({\mat\Delta}_1\otimes{\mat\Delta}_2)$. Leading to the estimate of the covariance as -\begin{displaymath} - \widetilde{\mat\Delta} = \tilde{s}^{-1}(\widetilde{\mat{\Delta}}_1\otimes\widetilde{\mat{\Delta}}_2) -\end{displaymath} - -\todo{ prove they are consistent, especially $\widetilde{\mat\Delta} = \tilde{s}^{-1}(\widetilde{\mat\Delta}_1\otimes\widetilde{\mat\Delta}_2)$!} - -The hoped for a benefit is that these covariance estimates are in a closed form which means there is no need for an additional iterative estimations step. 
Before we start with the derivation of the gradients define the following two quantities -\begin{align*} - \mat{S}_1 = \frac{1}{n}\sum_{i = 1}^n \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i = \frac{1}{n}\ten{R}_{(3)}\t{(\ten{R}\ttm[2]\widetilde{\mat{\Delta}}_2^{-1})_{(3)}}\quad{\color{gray}(q\times q)}, \\ - \mat{S}_2 = \frac{1}{n}\sum_{i = 1}^n \mat{R}_i\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} = \frac{1}{n}\ten{R}_{(2)}\t{(\ten{R}\ttm[3]\widetilde{\mat{\Delta}}_1^{-1})_{(2)}}\quad{\color{gray}(p\times p)}. -\end{align*} -\todo{Check tensor form!} - -Now, the matrix normal with the covariance matrix of the vectorized quantities of the form $\mat{\Delta} = s^{-1}(\mat{\Delta}_1\otimes\mat{\Delta}_2)$ has the form -\begin{align*} - f(\mat R) - &= \frac{1}{\sqrt{(2\pi)^{p q}|\mat\Delta|}}\exp\left(-\frac{1}{2}\t{\vec(\mat{R})} \mat\Delta^{-1}\vec(\mat{R})\right) \\ - &= \frac{s^{p q / 2}}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{s}{2}\tr(\mat\Delta_1^{-1}\t{\mat{R}}\mat\Delta_2^{-1}\mat{R})\right) -\end{align*} - -The approximated log-likelihood is then -\begin{align*} - \tilde{l}(\mat\alpha, \mat\beta) - &= - -\frac{n p q}{2}\log{2\pi} - -\frac{n}{2}\log|\widetilde{\mat{\Delta}}| - -\frac{1}{2}\sum_{i = 1}^n \t{\mat{r}_i}\widetilde{\mat{\Delta}}^{-1}\mat{r}_i \\ - &= - -\frac{n p q}{2}\log{2\pi} - +\frac{n p q}{2}\log\tilde{s} - -\frac{n p}{2}\log|\widetilde{\mat{\Delta}}_1| - -\frac{n q}{2}\log|\widetilde{\mat{\Delta}}_2| - -\frac{\tilde{s}}{2}\sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i). -\end{align*} -The second form is due to the property of the determinant for scaling and the Kronecker product giving that $|\widetilde{\mat\Delta}| = (\tilde{s}^{-1})^{p q}|\widetilde{\mat{\Delta}}_1|^p |\widetilde{\mat{\Delta}}_2|^q$ as well as an analog Kronecker decomposition as in the MLE case. 
- -Note that with the following holds -\begin{displaymath} - \sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i) - = n \tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1) - = n \tr(\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2) - = n \tr(\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}) - = n \tr(\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1}). -\end{displaymath} - -The derivation of the Gradient of the approximated log-likelihood $\tilde{l}$ is tedious but straight forward. We tackle the summands separately; -\begin{align*} - \d\log\tilde{s} &= \tilde{s}^{-1}\d\tilde{s} = \frac{2}{n\tilde{s}}\sum_{i = 1}^n \tr(\t{\mat{R}_i}\d\mat{R}_i) - = -\frac{2}{n\tilde{s}}\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\d\mat{\beta}), \\ - \d\log|\widetilde{\mat{\Delta}}_1| &=\tr(\widetilde{\mat{\Delta}}_1^{-1}\d\widetilde{\mat{\Delta}}_1) = \frac{2}{n}\sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\d\mat{R}_i) - = -\frac{2}{n}\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\widetilde{\mat{\Delta}}_1^{-1}\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\d\mat{\beta}), \\ - \d\log|\widetilde{\mat{\Delta}}_2| &=\tr(\widetilde{\mat{\Delta}}_2^{-1}\d\widetilde{\mat{\Delta}}_2) = \frac{2}{n}\sum_{i = 1}^n \tr(\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\d\mat{R}_i) - = -\frac{2}{n}\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\d\mat{\beta}) -\end{align*} -as well as -\begin{displaymath} - \d\,\tilde{s}\sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i) - = (\d\tilde{s})\sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i) - + \tilde{s}\, \d \sum_{i = 1}^n 
\tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i). -\end{displaymath} -We have -\begin{displaymath} - \d\tilde{s} = -\frac{2}{n}\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\d\mat{\beta}) -\end{displaymath} -and the remaining term -\begin{align*} - \d\sum_{i = 1}^n\tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i) - = 2\sum_{i = 1}^n \tr(&\t{\mat{f}_{y_i}}\t{\mat{\beta }}(\mat{R}_i \widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1} + \widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i - \widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i \widetilde{\mat{\Delta}}_1^{-1})\d\mat{\alpha} \\ - +\,&\mat{f}_{y_i} \t{\mat{\alpha}}(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} + \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} - \widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1})\d\mat{\beta }). -\end{align*} -The last one is tedious but straight forward. Its computation extensively uses the symmetry of $\widetilde{\mat{\Delta}}_1$, $\widetilde{\mat{\Delta}}_2$, the cyclic property of the trace and the relation $\d\mat{A}^{-1} = -\mat{A}^{-1}(\d\mat{A})\mat{A}^{-1}$. 
- -Putting it all together -\begin{align*} - \d\tilde{l}(\mat{\alpha}, \mat{\beta}) - &= \frac{n p q}{2}\Big(-\frac{2}{n\tilde{s}}\Big)\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\d\mat{\beta}) \\ - &\hspace{3em} - \frac{n p}{2}\Big(-\frac{2}{n}\Big)\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\widetilde{\mat{\Delta}}_1^{-1}\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\d\mat{\beta}) \\ - &\hspace{3em} - \frac{n q}{2}\Big(-\frac{2}{n}\Big)\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\d\mat{\beta}) \\ - &\hspace{3em} -\frac{1}{2}\Big(-\frac{2}{n}\Big)\Big(\sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i)\Big)\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\d\mat{\beta}) \\ - &\hspace{3em} -\frac{\tilde{s}}{2}2\sum_{i = 1}^n \tr\!\Big(\t{\mat{f}_{y_i}}\t{\mat{\beta }}(\mat{R}_i \widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1} + \widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i - \widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i \widetilde{\mat{\Delta}}_1^{-1})\d\mat{\alpha} \\ - &\hspace{3em} \hspace{4.7em} + \mat{f}_{y_i} \t{\mat{\alpha}}(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} + \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} - \widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1})\d\mat{\beta }\Big) \\ -% - &= \sum_{i = 1}^n \tr\bigg(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\Big( - -p q \tilde{s}^{-1} \mat{R}_i + p \mat{R}_i\widetilde{\mat{\Delta}}_1^{-1} + q \widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i + 
\tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1)\mat{R}_i \\ - &\hspace{3em} \hspace{4.7em} - \tilde{s}(\mat{R}_i \widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1} + \widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i - \widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i \widetilde{\mat{\Delta}}_1^{-1}) - \Big)\d\mat{\alpha}\bigg) \\ - &\hspace{3em}+ \sum_{i = 1}^n \tr\bigg(\mat{f}_{y_i}\t{\mat{\alpha}}\Big( - -p q \tilde{s}^{-1} \t{\mat{R}_i} + p \widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} + q \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1} + \tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1)\t{\mat{R}_i} \\ - &\hspace{3em}\hspace{3em} \hspace{4.7em} - \tilde{s}(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} + \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} - \widetilde{\mat{\Delta}}_1^{-1} \t{\mat{R}_i} \widetilde{\mat{\Delta}}_2^{-1}) - \Big)\d\mat{\beta}\bigg). -\end{align*} -Observe that the bracketed expressions before $\d\mat{\alpha}$ and $\d\mat{\beta}$ are transposes. 
Lets denote the expression for $\d\mat{\alpha}$ as $\mat{G}_i$ which has the form -\begin{displaymath} - \mat{G}_i - = (\tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1) - p q \tilde{s}^{-1})\mat{R}_i - + (q\mat{I}_p - \tilde{s}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2)\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i - + \mat{R}_i\widetilde{\mat{\Delta}}_1^{-1}(p\mat{I}_q - \tilde{s}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}) - + \tilde{s}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i\widetilde{\mat{\Delta}}_1^{-1} -\end{displaymath} -and with $\mathcal{G}$ the order 3 tensor stacking the $\mat{G}_i$'s such that the first mode indexes the observation -\begin{displaymath} - \ten{G} - = (\tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1) - p q \tilde{s}^{-1})\ten{R} - + \ten{R}\ttm[2](q\mat{I}_p - \tilde{s}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2)\widetilde{\mat{\Delta}}_2^{-1} - + \ten{R}\ttm[3](p\mat{I}_q - \tilde{s}\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1)\widetilde{\mat{\Delta}}_1^{-1} - + \tilde{s}\ten{R}\ttm[2]\widetilde{\mat{\Delta}}_2^{-1}\ttm[3]\widetilde{\mat{\Delta}}_1^{-1} -\end{displaymath} -This leads to the following form of the differential of $\tilde{l}$ given by -\begin{displaymath} - \d\tilde{l}(\mat{\alpha}, \mat{\beta}) - = \sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{G}_i\d\mat{\alpha}) - + \sum_{i = 1}^n \tr(\mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{G}_i}\d\mat{\beta}) -\end{displaymath} -and therefore the gradients -\begin{align*} - \nabla_{\mat{\alpha}}\tilde{l}(\mat{\alpha}, \mat{\beta}) &= \sum_{i = 1}^n \t{\mat{G}_i}\mat{\beta}\mat{f}_{y_i} - = \ten{G}_{(3)}\t{(\ten{F}\ttm[2]\mat{\beta})_{(3)}}, \\ - \nabla_{\mat{\beta}} \tilde{l}(\mat{\alpha}, \mat{\beta}) &= \sum_{i = 1}^n \mat{G}_i\mat{\alpha}\t{\mat{f}_{y_i}} - = \ten{G}_{(2)}\t{(\ten{F}\ttm[3]\mat{\alpha})_{(2)}}. 
-\end{align*} - -\todo{check the tensor version of the gradient!!!} - -\newpage - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%% Reference Summaries %%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -\section{Reference Summaries} -This section contains short summaries of the main references with each sub-section concerning one paper. - -\subsection{} - -\subsection{Generalized Tensor Decomposition With Features on Multiple Modes} -The \cite{TensorDecomp-HuLeeWang2022} paper proposes a multi-linear conditional mean model for a constraint rank tensor decomposition. Let the responses $\ten{Y}\in\mathbb{R}^{d_1\times ... \times\d_K}$ be an order $K$ tensor. Associated with each mode $k\in[K]$ they assume feature matrices $\mat{X}_k\in\mathbb{R}^{d_k\times p_k}$. Now, they assume that conditional on the feature matrices $\mat{X}_k$ the entries of the tensor $\ten{Y}$ are independent realizations. The rank constraint is specified through $\mat{r} = (r_1, ..., r_K)$, then the model is given by -\begin{displaymath} - \E(\ten{Y} | \mat{X}_1, ..., \mat{X}_K) = f(\ten{C}\times\{ \mat{X}_1\mat{M}_1, ..., \mat{X}_K\mat{M}_K \}),\qquad \t{\mat{M}_k}\mat{M}_k = \mat{I}_{r_k}\ \forall k\in[K]. -\end{displaymath} -The order $K$ tensor $\ten{C}\in\mathbb{R}^{r_1\times...\times r_K}$ is an unknown full-rank core tensor and the matrices $\mat{M}_k\in\mathbb{R}^{p_k\times r_k}$ are unknown factor matrices. The function $f$ is applied element wise and serves as the link function based on the assumed distribution family of the tensor entries. Finally, the operation $\times$ denotes the tensor-by-matrix product using a short hand -\begin{displaymath} - \ten{C}\times\{ \mat{X}_1\mat{M}_1, ..., \mat{X}_K\mat{M}_K \} - = \ten{C}\ttm[1]\mat{X}_1\mat{M}_1\ ...\ttm[K]\mat{X}_K\mat{M}_K -\end{displaymath} -with $\ttm[k]$ denoting the $k$-mode tensor matrix product. 
- -The algorithm for estimation of $\ten{C}$ and $\mat{M}_1, ..., \mat{M}_K$ assumes the individual conditional entries of $\ten{Y}$ to be independent and to follow a generalized linear model with link function $f$. The proposed algorithm is an iterative algorithm for minimizing the negative log-likelihood -\begin{displaymath} - l(\ten{C}, \mat{M}_1, ..., \mat{M}_K) = \langle \ten{Y}, \Theta \rangle - \sum_{i_1, ..., i_K} b(\Theta_{i_1, ..., i_K}), \qquad \Theta = \ten{C}\times\{ \mat{X}_1\mat{M}_1, ..., \mat{X}_K\mat{M}_K \} -\end{displaymath} -where $b = f'$ it the derivative of the canonical link function $f$ in the generalized linear model the conditioned entries of $\ten{Y}$ follow. The algorithm utilizes the higher-order SVD (HOSVD) to enforce the rank-constraint. - -The main benefit is that this approach generalizes well to a multitude of different structured data sets. - -\todo{ how does this relate to the $\mat{X} = \mat{\mu} + \mat{\beta}\mat{f}_y\t{\mat{\alpha}} + \mat{\epsilon}$ model.} - -\end{document} diff --git a/sim/plots.R b/sim/plots.R index 328518c..d657356 100644 --- a/sim/plots.R +++ b/sim/plots.R @@ -3,86 +3,98 @@ if (!endsWith(getwd(), "/sim")) { setwd("sim") } -file.prefix <- "sim-ising-small" -date <- "20221012" # yyyymmdd, to match all "[0-9]{6}" -time <- "[0-9]{4}" # HHMM, to match all "[0-9]{4}" -sim <- Reduce(rbind, Map(function(path) { - df <- read.csv(path) - df$n <- as.integer(tail(head(strsplit(path, "[-.]")[[1]], -1), 1)) - df -}, list.files(".", pattern = paste0( - "^", file.prefix, "-", date, "T", time, "-[0-9]+[.]csv$", collapse = "" -)))) +sim.plot <- function(file.prefix, date, to.file = FALSE) { -stats <- aggregate(. ~ n, sim, mean) -q75 <- aggregate(. ~ n, sim, function(x) quantile(x, 0.75)) -q25 <- aggregate(. 
~ n, sim, function(x) quantile(x, 0.25)) + # file.prefix <- "sim-ising-small" + # # file.prefix <- "sim-normal" + # date <- "20221012" # yyyymmdd, to match all "[0-9]{6}" + time <- "[0-9]{4}" # HHMM, to match all "[0-9]{4}" + colors <- c( + PCA = "#a11414", + HOPCA = "#2a62b6", + TSIR = "#9313b9", + GMLM = "#247407" + ) + line.width <- 1.75 + margins <- c(5.1, 4.1, 4.1, 0.1) -colors <- c(gmlm = "#247407", hopca = "#2a62b6", pca = "#a11414", tsir = "#9313b9") -line.width <- 1.75 -margins <- c(5.1, 4.1, 4.1, 0.1) + sim <- Reduce(rbind, Map(function(path) { + df <- read.csv(path) + df$n <- as.integer(tail(head(strsplit(path, "[-.]")[[1]], -1), 1)) + df + }, list.files(".", pattern = paste0( + "^", file.prefix, "-", date, "T", time, "-[0-9]+[.]csv$", collapse = "" + )))) + stats <- aggregate(. ~ n, sim, mean) + q75 <- aggregate(. ~ n, sim, function(x) quantile(x, 0.75)) + q25 <- aggregate(. ~ n, sim, function(x) quantile(x, 0.25)) -layout(mat = matrix(c( - 1, 2, - 3, 3 -), 2, 2, byrow = TRUE), - widths = c(1, 1), - heights = c(8, 1), respect = FALSE) -# layout.show(3) + if (to.file) { + width <- 720 + png(paste(file.prefix, "-", date, ".png", sep = ""), + width = width, height = round((6 / 11) * width, -1), + pointsize = 12) + } -with(stats, { - par(mar = margins) - plot(range(n), c(0, 1.05), - type = "n", bty = "n", main = "Subspace Distance", - xlab = "Sample Size", ylab = "Error") - lines(n, dist.subspace.gmlm, col = colors["gmlm"], lwd = line.width) - lines(n, dist.subspace.hopca, col = colors["hopca"], lwd = line.width) - lines(n, dist.subspace.pca, col = colors["pca"], lwd = line.width) - lines(n, dist.subspace.tsir, col = colors["tsir"], lwd = line.width) + layout(mat = matrix(c( + 1, 2, + 3, 3 + ), 2, 2, byrow = TRUE), + widths = c(1, 1), + heights = c(12, 1), respect = FALSE) + # layout.show(3) - xn <- c(q75$n, rev(q25$n)) - polygon(x = xn, y = c(q75$dist.subspace.gmlm, rev(q25$dist.subspace.gmlm)), - col = adjustcolor(colors["gmlm"], alpha.f = 0.3), border = 
NA) - polygon(x = xn, y = c(q75$dist.subspace.hopca, rev(q25$dist.subspace.hopca)), - col = adjustcolor(colors["hopca"], alpha.f = 0.3), border = NA) - polygon(x = xn, y = c(q75$dist.subspace.pca, rev(q25$dist.subspace.pca)), - col = adjustcolor(colors["pca"], alpha.f = 0.3), border = NA) - polygon(x = xn, y = c(q75$dist.subspace.tsir, rev(q25$dist.subspace.tsir)), - col = adjustcolor(colors["tsir"], alpha.f = 0.3), border = NA) + with(stats, { + par(mar = margins) + plot(range(n), 0:1, + type = "n", bty = "n", main = "Subspace Distance", + xlab = "Sample Size", ylab = "Error") + lines(n, dist.subspace.gmlm, col = colors["GMLM"], lwd = line.width) + lines(n, dist.subspace.hopca, col = colors["HOPCA"], lwd = line.width) + lines(n, dist.subspace.pca, col = colors["PCA"], lwd = line.width) + lines(n, dist.subspace.tsir, col = colors["TSIR"], lwd = line.width) - # par(mar = rep(0, 4)) - # legend("topright", legend = names(colors), col = colors, lwd = line.width, - # lty = 1, bty = "n") - # par(mar = margins) -}) + xn <- c(q75$n, rev(q25$n)) + polygon(x = xn, y = c(q75$dist.subspace.gmlm, rev(q25$dist.subspace.gmlm)), + col = adjustcolor(colors["GMLM"], alpha.f = 0.3), border = NA) + polygon(x = xn, y = c(q75$dist.subspace.hopca, rev(q25$dist.subspace.hopca)), + col = adjustcolor(colors["HOPCA"], alpha.f = 0.3), border = NA) + polygon(x = xn, y = c(q75$dist.subspace.pca, rev(q25$dist.subspace.pca)), + col = adjustcolor(colors["PCA"], alpha.f = 0.3), border = NA) + polygon(x = xn, y = c(q75$dist.subspace.tsir, rev(q25$dist.subspace.tsir)), + col = adjustcolor(colors["TSIR"], alpha.f = 0.3), border = NA) + }) -with(stats, { - par(mar = margins) - plot(range(n), c(0, 1.05), - type = "n", bty = "n", main = "RMSE (Prediction Error)", - xlab = "Sample Size", ylab = "Error") - xn <- c(q75$n, rev(q25$n)) - polygon(x = xn, y = c(q75$error.pred.gmlm, rev(q25$error.pred.gmlm)), - col = adjustcolor(colors["gmlm"], alpha.f = 0.3), border = NA) - polygon(x = xn, y = 
c(q75$error.pred.hopca, rev(q25$error.pred.hopca)), - col = adjustcolor(colors["hopca"], alpha.f = 0.3), border = NA) - polygon(x = xn, y = c(q75$error.pred.pca, rev(q25$error.pred.pca)), - col = adjustcolor(colors["pca"], alpha.f = 0.3), border = NA) - polygon(x = xn, y = c(q75$error.pred.tsir, rev(q25$error.pred.tsir)), - col = adjustcolor(colors["tsir"], alpha.f = 0.3), border = NA) - lines(n, error.pred.gmlm, col = colors["gmlm"], lwd = line.width) - lines(n, error.pred.hopca, col = colors["hopca"], lwd = line.width) - lines(n, error.pred.pca, col = colors["pca"], lwd = line.width) - lines(n, error.pred.tsir, col = colors["tsir"], lwd = line.width) + with(stats, { + par(mar = margins) + plot(range(n), 0:1, + type = "n", bty = "n", main = "RMSE (Prediction Error)", + xlab = "Sample Size", ylab = "Error") + xn <- c(q75$n, rev(q25$n)) + polygon(x = xn, y = c(q75$error.pred.gmlm, rev(q25$error.pred.gmlm)), + col = adjustcolor(colors["GMLM"], alpha.f = 0.3), border = NA) + polygon(x = xn, y = c(q75$error.pred.hopca, rev(q25$error.pred.hopca)), + col = adjustcolor(colors["HOPCA"], alpha.f = 0.3), border = NA) + polygon(x = xn, y = c(q75$error.pred.pca, rev(q25$error.pred.pca)), + col = adjustcolor(colors["PCA"], alpha.f = 0.3), border = NA) + polygon(x = xn, y = c(q75$error.pred.tsir, rev(q25$error.pred.tsir)), + col = adjustcolor(colors["TSIR"], alpha.f = 0.3), border = NA) + lines(n, error.pred.gmlm, col = colors["GMLM"], lwd = line.width) + lines(n, error.pred.hopca, col = colors["HOPCA"], lwd = line.width) + lines(n, error.pred.pca, col = colors["PCA"], lwd = line.width) + lines(n, error.pred.tsir, col = colors["TSIR"], lwd = line.width) + }) - # par(mar = rep(0, 4)) - # legend("topright", legend = names(colors), col = colors, lwd = line.width, - # lty = 1, bty = "n") - # par(mar = margins) -}) + par(mar = rep(0, 4)) + plot(1:2, 1:2, type = "n", bty = "n", axes = FALSE, xlab = "", ylab = "") + legend("center", legend = names(colors), col = colors, lwd = 
line.width, + lty = 1, bty = "n", horiz = TRUE) -par(mar = c(0, 1, 1, 0)) -plot(1:2, 1:2, type = "n", bty = "n", axes = FALSE, xlab = "", ylab = "") -legend("center", legend = names(colors), col = colors, lwd = line.width, - lty = 1, bty = "n", horiz = TRUE) + if (to.file) { + dev.off() + } +} + +sim.plot("sim-ising-small", "20221012", TRUE) +sim.plot("sim-normal", "20221012", TRUE) diff --git a/tensorPredictors/R/GMLM.R b/tensorPredictors/R/GMLM.R index f80c5d3..060c398 100644 --- a/tensorPredictors/R/GMLM.R +++ b/tensorPredictors/R/GMLM.R @@ -45,7 +45,7 @@ GMLM.default <- function(X, Fy, sample.axis = 1L, # eta1 alphas Omegas family$grad(X, Fy, params[[1]], params[[2]], params[[3]]) }, - params = family$initialize(X, Fy), # initialen parameter estimates + params = family$initialize(X, Fy), # initial parameter estimates fun.lincomb = function(a, lhs, b, rhs) { list( a * lhs[[1]] + b * rhs[[1]], diff --git a/tensorPredictors/R/make_gmlm_family.R b/tensorPredictors/R/make_gmlm_family.R index 513bb03..8d06a93 100644 --- a/tensorPredictors/R/make_gmlm_family.R +++ b/tensorPredictors/R/make_gmlm_family.R @@ -59,6 +59,7 @@ make.gmlm.family <- function(name) { # scaled negative log-likelihood log.likelihood <- function(X, Fy, eta1, alphas, Omegas) { n <- tail(dim(X), 1) # sample size + p <- head(dim(X), -1) # predictor dimensions # conditional mean mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Map(solve, Omegas)) diff --git a/tensorPredictors/tests/testthat/test-TensorNormalFamily.R b/tensorPredictors/tests/testthat/test-TensorNormalFamily.R index d6a5b2e..57159fa 100644 --- a/tensorPredictors/tests/testthat/test-TensorNormalFamily.R +++ b/tensorPredictors/tests/testthat/test-TensorNormalFamily.R @@ -78,8 +78,8 @@ test_that("Tensor-Normal moments of the sufficient statistic t(X)", { eta_y1 <- eta1 # + mlm(Fy, alphas) = + 0 eta_y2 <- -0.5 * Reduce(`%x%`, rev(Omegas)) - # draw a sample - X <- rtensornorm(n, mu, Deltas, r + 1L) + # # draw a sample + # X <- rtensornorm(n, mu, 
Deltas, r + 1L) # tensor normal log-partition function given eta_y log.partition <- function(eta_y1, eta_y2) { @@ -117,14 +117,14 @@ test_that("Tensor-Normal moments of the sufficient statistic t(X)", { # Et2.est <- colMeans(rowKronecker(mat(X, r + 1L), mat(X, r + 1L))) # (analytic) convariance blocks of the sufficient statistic - # Ctij = Cov(ti(X), tj(X) | Y = y) + # Ctij = Cov(ti(X), tj(X) | Y = y) for i, j = 1, 2 Ct11.ana <- Reduce(`%x%`, rev(Deltas)) Ct12.ana <- local({ # Analytic solution / `vec(mu %x% Delta) + vec(mu' %x% Delta)` - ct12 <- 2 * c(mu) %x% Reduce(`%x%`, rev(Deltas)) + ct12 <- c(mu) %x% Reduce(`%x%`, rev(Deltas)) # and symmetrize! dim(ct12) <- rep(prod(p), 3) - (1 / 2) * (ct12 + aperm(ct12, c(3, 1, 2))) + ct12 + aperm(ct12, c(3, 1, 2)) }) Ct22.ana <- local({ Delta <- Reduce(`%x%`, rev(Deltas)) # Ct11.ana