wip: LaTeX MLE estimates

This commit is contained in:
Daniel Kapla 2022-05-24 14:47:32 +02:00
parent a2963024ef
commit b94b48091e
1 changed files with 175 additions and 102 deletions

View File

@ -50,6 +50,7 @@
\newcommand{\ten}[1]{\mathcal{#1}}
\renewcommand{\vec}{\operatorname{vec}}
\newcommand{\dist}{\operatorname{dist}}
\newcommand{\rank}{\operatorname{rank}}
\DeclareMathOperator{\kron}{\otimes} % Kronecker Product
\DeclareMathOperator{\hada}{\odot} % Hadamard Product
\newcommand{\ttm}[1][n]{\times_{#1}} % n-mode product (Tensor Times Matrix)
@ -81,44 +82,48 @@
\maketitle
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Introduction %%%
%%% Preliminary %%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Notation}
We start with a brief summary of the used notation.
Vectors are write as boldface lowercase letters (e.g. $\mat a$, $\mat b$), matrices use boldface uppercase or Greek letters (e.g. $\mat A$, $\mat B$, $\mat\alpha$, $\mat\Delta$). The identity matrix of dimensions $p\times p$ is denoted by $\mat{I}_p$ and the commutation matrix as $\mat{K}_{p, q}$ or $\mat{K}_p$ is case of $p = q$. Tensors, meaning multi-dimensional arrays of order at least 3, use uppercase calligraphic letters (e.g. $\ten{A}$, $\ten{B}$, $\ten{X}$, $\ten{Y}$, $\ten{F}$). Boldface indices (e.g. $\mat{i}, \mat{j}, \mat{k}$) denote multi-indices $\mat{i} = (i_1, ..., i_r)\in[\mat{d}]$ where the bracket notation is a shorthand for $[r] = \{1, ..., r\}$ which in conjunction with a multi-index as argument means $[\mat{d}] = [d_1]\times ... \times[d_K]$.
\todo{write this}
Let $\ten{A} = (a_{i_1,...,i_r})\in\mathbb{R}^{d_1\times ...\times d_r}$ be an order\footnote{Also called rank, therefore the variable name $r$, but this term is \emph{not} used as it leads to confusion with the rank as in ``the rank of a matrix''.} $r$ tensor where $r\in\mathbb{N}$ is the number of modes or axis of $\ten{A}$. For matrices $\mat{B}_k\in\mathbb{R}^{p_k\times d_k}$ with $k\in[r] = \{1, 2, ..., r\}$ the \emph{multi-linear multiplication} is defined element wise as
\begin{displaymath}
(\ten{A}\times\{\mat{B}_1, ..., \mat{B}_r\})_{j_1, ..., j_r} = \sum_{i_1, ..., i_r = 1}^{d_1, ..., d_r} a_{i_1, ..., i_r}(B_{1})_{j_1, i_1} \cdots (B_{r})_{j_r, i_r}
\end{displaymath}
which results in an order $r$ tensor of dimensions $p_1\times ...\times p_k)$. With this the \emph{$k$-mode product} between the tensor $\ten{A}$ with the matrix $\mat{B}_k$ is given by
\begin{displaymath}
\mat{A}\times_k\mat{B}_k = \ten{A}\times\{\mat{I}_{d_1}, ..., \mat{I}_{d_{k-1}}, \mat{B}_{k}, \mat{I}_{d_{k+1}}, ..., \mat{I}_{d_r}\}.
\end{displaymath}
Furthermore, the notation $\ten{A}\times_{k\in S}$ is a short hand for writing the iterative application if the mode product for all indices in $S\subset[r]$. For example $\ten{A}\times_{k\in\{2, 5\}}\mat{B}_k = \ten{A}\times_2\mat{B}_2\times_5\mat{B}_5$. By only allowing $S$ to be a set this notation is unambiguous because the mode products commutes for different modes $j\neq k\Rightarrow\ten{A}\times_j\mat{B}_j\times_k\mat{B}_k = \ten{A}\times_k\mat{B}_k\times_j\mat{B}_j$.
Let $\ten{A}$ be a multi-dimensional array of order (rank) $r$ with dimensions $p_1\times ... \times p_r$ and the matrices $\mat{B}_i$ of dimensions $q_i\times p_i$ for $i = 1, ..., r$, then
The \emph{inner product} between two tensors of the same order and dimensions is
\begin{displaymath}
\ten{A} \ttm[1] \mat{B}_1 \ttm[2] \ldots \ttm[r] \mat{B}_r
= \ten{A}\times\{ \mat{B}_1, ..., \mat{B}_r \}
= \ten{A}\times_{i\in[r]} \mat{B}_i
= (\ten{A}\times_{i\in[r]\backslash j} \mat{B}_i)\ttm[j]\mat{B}_j
\langle\ten{A}, \ten{B}\rangle = \sum_{i_1, ..., i_r} a_{i_1, ..., i_r}b_{i_1, ..., i_r}
\end{displaymath}
As an alternative example consider
with which the \emph{Frobenius Norm} $\|\ten{A}\|_F = \sqrt{\langle\ten{A}, \ten{A}\rangle}$. Of interest is also the \emph{maximum norm} $\|\ten{A}\|_{\infty} = \max_{i_1, ..., i_K} a_{i_1, ..., i_K}$. Furthermore, the Frobenius and maximum norm are also used for matrices while for a vector $\mat{a}$ the \emph{2 norm} is $\|\mat{a}\|_2 = \sqrt{\langle\mat{a}, \mat{a}\rangle}$.
Matrices and tensor can be \emph{vectorized} by the \emph{vectorization} operator $\vec$. For tensors of order at least $2$ the \emph{flattening} (or \emph{unfolding} or \emph{matricization}) is a reshaping of the tensor into a matrix along an particular mode. For a tensor $\ten{A}$ of order $r$ and dimensions $d_1, ..., d_r$ the $k$-mode unfolding $\ten{A}_{(k)}$ is a $d_k\times \prod_{l=1, l\neq k}d_l$ matrix. For the tensor $\ten{A} = (a_{i_1,...,i_r})\in\mathbb{R}^{d_1, ..., d_r}$ the elements of the $k$ unfolded tensor $\ten{A}_{(k)}$ are
\begin{displaymath}
\ten{A}\times_2\mat{B}_2\times_3\mat{B}_3 = \ten{A}\times\{ \mat{I}, \mat{B}_2, \mat{B}_3 \} = \ten{A}\times_{i\in\{2, 3\}}\mat{B}_i
\end{displaymath}
Another example
\begin{displaymath}
\mat{B}\mat{A}\t{\mat{C}} = \mat{A}\times_1\mat{B}\times_2\mat{C}
= \mat{A}\times\{\mat{B}, \mat{C}\}
(\ten{A}_{(k)})_{i_k, j} = a_{i_1, ..., i_r}\quad\text{ with }\quad j = 1 + \sum_{\substack{l = 1\\l \neq k}}^r (i_l - 1) \prod_{\substack{m = 1\\m\neq k}}^{l - 1}d_m.
\end{displaymath}
\begin{displaymath}
(\ten{A}\ttm[i]\mat{B})_{(i)} = \mat{B}\ten{A}_{(i)}
\end{displaymath}
The rank of a tensor $\ten{A}$ of dimensions $d_1\times ...\times d_r$ is given by a vector $\rank{\ten{A}} = (a_1, ..., a_r)\in[d_1]\times...\times[d_r]$ where $a_k = \rank(\ten{A}_{(k)})$ is the usual matrix rank of the $k$ unfolded tensor.
{\color{red}$\mathcal{S}^p$, $\mathcal{S}_{+}^p$, $\mathcal{S}_{++}^p$ symmetric matrices of dimensions $p\times p$, or call it $\operatorname{Sym}(p)$}
{\color{red}The group of orthogonas matrices $O(p)$ of dim $p\times p$, where $O(p, q)$ are the $p\times q$ matrices (a.k.a. the Stiefel manifold)}
\todo{continue}
\section{Tensor Normal Distribution}
Let $\ten{X}$ be a multi-dimensional array random variable of order (rank) $r$ with dimensions $p_1\times ... \times p_r$ written as
Let $\ten{X}$ be a multi-dimensional array random variable of order $r$ with dimensions $p_1\times ... \times p_r$ written as
\begin{displaymath}
\ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r).
\end{displaymath}
Its density is given by
\begin{displaymath}
f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{p_{-i}}} \Big)^{-1}
f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{p_{\lnot i}}} \Big)^{-1}
\exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right)
\end{displaymath}
where $p_{\lnot i} = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution
@ -172,6 +177,7 @@ where the sampling from the standard Multi-Array Normal is done by sampling all
\section{Introduction}
\todo{rewrite this section to multi-variate arrays (tensors)}
We assume the model
\begin{displaymath}
\mat{X} = \mat{\mu} + \mat{\beta}\mat{f}_y \t{\mat{\alpha}} + \mat{\epsilon}
@ -269,7 +275,7 @@ Now, substitution of $\d\mat{r}_i$ into \eqref{eq:deriv1} gives the gradients (n
\end{align*}
These quantities are very verbose as well as completely unusable for an implementation. By detailed analysis of the gradients we see that the main parts are only element permutations with a high sparsity. By defining the following compact matrix
\begin{equation}\label{eq:permTransResponse}
\mat G = \vec^{-1}_{q r}\bigg(\Big( \sum_{i = 1}^n \vec\mat{f}_{y_i}\otimes \widehat{\mat\Delta}^{-1}\mat{r}_i \Big)_{\pi(i)}\bigg)_{i = 1}^{p q k r}{\color{gray}\qquad(q r \times p k)}
\mat G = \vec^{-1}_{q r}\bigg(\Big( \sum_{j = 1}^n \vec\mat{f}_{y_j}\otimes \widehat{\mat\Delta}^{-1}\mat{r}_j \Big)_{\pi(i)}\bigg)_{i = 1}^{p q k r}{\color{gray}\qquad(q r \times p k)}
\end{equation}
with $\pi$ being a permutation of $p q k r$ elements corresponding to permuting the axis of a 4D tensor of dimensions $p\times q\times k\times r$ by $(2, 4, 1, 3)$. As a generalization of transposition this leads to a rearrangement of the elements corresponding to the permuted 4D tensor with dimensions $q\times r\times p\times k$ which is then vectorized and reshaped into a matrix of dimensions $q r \times p k$. With $\mat G$ the gradients simplify to \todo{validate this mathematically}
\begin{align*}
@ -285,94 +291,161 @@ with $\pi$ being a permutation of $p q k r$ elements corresponding to permuting
%%% Kronecker Covariance Structure %%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Kronecker Covariance Structure}
Now we assume the residuals covariance has the form $\mat\Delta = \mat\Delta_1\otimes\mat\Delta_2$ where $\mat\Delta_1$, $\mat\Delta_2$ are $q\times q$, $p\times p$ covariance matrices, respectively. This is analog to the case that $\mat{R}_i$'s are i.i.d. Matrix Normal distribution
As before we let the sample model for tensor valued opbservations and responses or oder $r$ be
\begin{displaymath}
\mat{R}_i = \mat{X}_i - \mat\mu - \mat\beta\mat{f}_{y_i}\t{\mat\alpha} \sim \mathcal{MN}_{p\times q}(\mat 0, \mat\Delta_2, \mat\Delta_1).
\ten{X} = \ten{\mu} + \ten{F}\times_{j\in[r]}\alpha_j + \ten{\epsilon}
\end{displaymath}
The density of the Matrix Normal (with mean zero) is equivalent to the vectorized quantities being multivariate normal distributed with Kronecker structured covariance
but the error tensor $\ten{\epsilon}\sim\mathcal{TN}(0, \mat{\Delta}_1, ..., \mat{\Delta}_r)$ is Tensor Normal distributed with mean zero and covariance matrices $\mat{\Delta}_1, ..., \mat{\Delta}_r$.
The sample model for $n$ observations has the same form with an additional sample axis in the last mode of $\ten{X}$ and $\ten{Y}$ with dimensions $p_1\times ...\times p_r\times n$ and $q_1\times ...\times q_r\times n$, respectively. Let the residual tensor be
\begin{displaymath}
\ten{R} = \ten{X} - \ten{\mu} + \ten{F}\times_{j\in[r]}\alpha_j.
\end{displaymath}
By the definition of the Tensor Normal, using the notation $p_{\lnot j} = \prod_{k\neq j}p_j$, we get for observations $\ten{X}, \ten{F}$ the log-likelihood in terms of the residuals as
\begin{displaymath}
l = -\frac{n p}{2}\log 2\pi
-\sum_{j = 1}^r \frac{n p_{\lnot j}}{2}\log|\mat{\Delta}_j|
-\frac{1}{2}\langle \ten{R}\times_{j\in[r]}\mat{\Delta}_j^{-1}, \ten{R} \rangle.
\end{displaymath}
Note that the log-likelihood depends on the covariance matrices $\mat{\Delta}_j$, $j = 1, ..., r$ as well as the mean $\mu$ and the parameter matrices $\mat{\alpha}_j$, $j = 1, ..., r$ through the residuals $\ten{R}$.
\subsection{MLE estimates}
For deriving the MLE estimates we compute the differential of the log-likelihood given the data as
\begin{displaymath}
\d l =
-\sum_{j = 1}^r \frac{n p_{\lnot j}}{2}\d\log|{\mat{\Delta}}_j|
-\frac{1}{2}\sum_{j = 1}^r\langle {\ten{R}}\times_{k\in[r]\backslash j}{\mat{\Delta}}_k^{-1}\times_j\d{\mat{\Delta}}^{-1}_j \rangle
-\langle {\ten{R}}\times_{j\in[r]}{\mat{\Delta}}_j^{-1}, \d{\ten{R}} \rangle.
\end{displaymath}
Using $\d\log|\mat{A}| = \tr(\mat{A}^{-1}\d\mat{A})$ and $\d\mat{A}^{-1} = -\mat{A}^{-1}(\d\mat{A})\mat{A}^{-1}$ as well as $\langle\ten{A}, \ten{B}\rangle = \tr(\ten{A}_{(j)}\t{\ten{B}_{(j)}})$ for any $j = 1, ..., r$ we get the differential of the estimated log-likelihood as
\begin{align*}
f(\mat R)
&= \frac{1}{\sqrt{(2\pi)^{p q}|\mat\Delta|}}\exp\left(-\frac{1}{2}\t{\vec(\mat{R})} \mat\Delta^{-1}\vec(\mat{R})\right) \\
&= \frac{1}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{1}{2}\tr(\mat\Delta_1^{-1}\t{\mat{R}}\mat\Delta_2^{-1}\mat{R})\right)
\d \hat{l}
&=
-\sum_{j = 1}^r \frac{n p_{\lnot j}}{2}\tr(\widehat{\mat{\Delta}}_j^{-1}\d\widehat{\mat{\Delta}}_j)
-\frac{1}{2}\sum_{j = 1}^r\tr\!\Big((\d{\widehat{\mat{\Delta}}}^{-1}_j)({\ten{R}}\times_{k\in[r]\backslash j}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}\t{\ten{R}_{(j)}}\Big)
-\langle {\ten{R}}\times_{j\in[r]}{\widehat{\mat{\Delta}}}_j^{-1}, \d{\ten{R}} \rangle \\
&=
-\frac{1}{2}\sum_{j = 1}^r \tr\left(\Big(
n p_{\lnot j}\mat{I}_{p_j} - \widehat{\mat{\Delta}}_j^{-1}(\ten{R}\times_{k\in[r]\backslash j}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}\t{\ten{R}_{(j)}}
\right)\widehat{\mat{\Delta}}_j^{-1}\d\widehat{\mat{\Delta}}_j\Big)
-\langle \ten{R}\times_{j\in[r]}{\widehat{\mat{\Delta}}}_j^{-1}, \d\ten{R} \rangle.
\end{align*}
which leads for given data to the log-likelihood
\begin{displaymath}
l(\mat{\mu}, \mat\Delta_1, \mat\Delta_2) =
-\frac{n p q}{2}\log 2\pi
-\frac{n p}{2}\log|\mat{\Delta}_1|
-\frac{n q}{2}\log|\mat{\Delta}_2|
-\frac{1}{2}\sum_{i = 1}^n \tr(\mat\Delta_1^{-1}\t{\mat{R}_i}\mat\Delta_2^{-1}\mat{R}_i).
\end{displaymath}
\subsection{MLE covariance estimates}
Out first order of business is to derive the MLE estimated of the covariance matrices $\mat\Delta_1$, $\mat\Delta_2$ (the mean estimate $\widehat{\mat\mu}$ is trivial). Therefore, we look at the differentials with respect to changes in the covariance matrices as
With $\ten{R}$ not dependent on the $\widehat{\mat{\Delta}}_j$'s we get the covariance MLE estimates
\begin{align*}
\d l(\mat\Delta_1, \mat\Delta_2) &=
-\frac{n p}{2}\d\log|\mat{\Delta}_1|
-\frac{n q}{2}\d\log|\mat{\Delta}_2|
-\frac{1}{2}\sum_{i = 1}^n
\tr( (\d\mat\Delta_1^{-1})\t{\mat{R}_i}\mat\Delta_2^{-1}\mat{R}_i
+ \mat\Delta_1^{-1}\t{\mat{R}_i}(\d\mat\Delta_2^{-1})\mat{R}_i) \\
&=
-\frac{n p}{2}\tr\mat{\Delta}_1^{-1}\d\mat{\Delta}_1
-\frac{n q}{2}\tr\mat{\Delta}_2^{-1}\d\mat{\Delta}_2 \\
&\qquad\qquad
+\frac{1}{2}\sum_{i = 1}^n
\tr( \mat\Delta_1^{-1}(\d\mat\Delta_1)\mat\Delta_1^{-1}\t{\mat{R}_i}\mat\Delta_2^{-1}\mat{R}_i
+ \mat\Delta_1^{-1}\t{\mat{R}_i}\mat\Delta_2^{-1}(\d\mat\Delta_2)\mat\Delta_2^{-1}\mat{R}_i) \\
&= \frac{1}{2}\tr\!\Big(\Big(
-n p \mat{I}_q + \mat\Delta_1^{-1}\sum_{i = 1}^n \t{\mat{R}_i}\mat\Delta_2^{-1}\mat{R}_i
\Big)\mat{\Delta}_1^{-1}\d\mat{\Delta}_1\Big) \\
&\qquad\qquad
+ \frac{1}{2}\tr\!\Big(\Big(
-n q \mat{I}_p + \mat\Delta_2^{-1}\sum_{i = 1}^n \mat{R}_i\mat\Delta_1^{-1}\t{\mat{R}_i}
\Big)\mat{\Delta}_2^{-1}\d\mat{\Delta}_2\Big) \overset{!}{=} 0.
\widehat{\mat{\Delta}}_j &= \frac{1}{n p_{\lnot j}}(\ten{R}\times_{k\in[r]\backslash j}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}\t{\ten{R}_{(j)}}
\qquad{\color{gray}p_j\times p_j} && j = 1, ..., r.
\end{align*}
Setting $\d l$ to zero yields the MLE estimates as
\begin{displaymath}
\widehat{\mat{\mu}} = \overline{\mat X}{\color{gray}\quad(p\times q)}, \qquad
\widehat{\mat\Delta}_1 = \frac{1}{n p}\sum_{i = 1}^n \t{\mat{R}_i}\widehat{\mat\Delta}_2^{-1}\mat{R}_i{\color{gray}\quad(q\times q)}, \qquad
\widehat{\mat\Delta}_2 = \frac{1}{n q}\sum_{i = 1}^n \mat{R}_i\widehat{\mat\Delta}_1^{-1}\t{\mat{R}_i}{\color{gray}\quad(p\times p)}.
\end{displaymath}
Next, analog to above, we take the estimated log-likelihood and derive gradients with respect to $\mat{\alpha}$, $\mat{\beta}$.
The estimated log-likelihood derives by replacing the unknown covariance matrices by there MLE estimates leading to
\begin{displaymath}
\hat{l}(\mat\alpha, \mat\beta) =
-\frac{n p q}{2}\log 2\pi
-\frac{n p}{2}\log|\widehat{\mat{\Delta}}_1|
-\frac{n q}{2}\log|\widehat{\mat{\Delta}}_2|
-\frac{1}{2}\sum_{i = 1}^n \tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat{R}_i)
\end{displaymath}
and its differential
\begin{displaymath}
\d\hat{l}(\mat\alpha, \mat\beta) =
-\frac{n p}{2}\d\log|\widehat{\mat{\Delta}}_1|
-\frac{n q}{2}\d\log|\widehat{\mat{\Delta}}_2|
-\frac{1}{2}\sum_{i = 1}^n \d\tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat{R}_i).
\end{displaymath}
We first take a closer look at the sum. After a bit of algebra using $\d\mat A^{-1} = -\mat A^{-1}(\d\mat A)\mat A^{-1}$ and the definitions of $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$ the sum can be rewritten
\begin{displaymath}
\frac{1}{2}\sum_{i = 1}^n \d\tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat{R}_i)
= \sum_{i = 1}^n \tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\d\mat{R}_i)
- \frac{np}{2}\d\log|\widehat{\mat\Delta}_1|
- \frac{nq}{2}\d\log|\widehat{\mat\Delta}_2|.
\end{displaymath}
This means that most of the derivative cancels out and we get
as well as gradients (even though they are not realy used, except in the case of a pure gradient based estimation procedure which might ease the estimation burden as all the MLE estimates are cross dependent)
\begin{align*}
\d\hat{l}(\mat\alpha, \mat\beta)
&= \sum_{i = 1}^n \tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\d\mat{R}_i) \\
&= \sum_{i = 1}^n \tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}((\d\mat\beta)\mat{f}_{y_i}\t{\mat\alpha} + \mat\beta\mat{f}_{y_i}\t{(\d\mat\alpha}))) \\
&= \sum_{i = 1}^n \t{\vec(\widehat{\mat{\Delta}}_2^{-1}\mat{R}_i\widehat{\mat{\Delta}}_1^{-1}\mat\alpha\t{\mat{f}_{y_i}})}\d\vec\mat\beta
+ \sum_{i = 1}^n \t{\vec(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat\beta\mat{f}_{y_i})}\d\vec\mat\alpha
\nabla_{\widehat{\mat{\Delta}}_j}\hat{l} &= \frac{1}{2}\widehat{\mat{\Delta}}_j^{-1}\big(
\ten{R}_{(j)}\t{(\ten{R}\times_{k\in[r]\backslash j}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}}\widehat{\mat{\Delta}}_j^{-1} - n p_{\lnot j}\mat{I}_{p_j}
\big) \\
&= \frac{1}{2}\widehat{\mat{\Delta}}_j^{-1}\big(
\ten{R}_{(j)}\t{(\ten{R}\times_{k\in[r]}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}} - n p_{\lnot j}\mat{I}_{p_j}
\big)
\qquad{\color{gray}p_j\times p_j} && j = 1, ..., r.
\end{align*}
which means the gradients are
We continue by substitution of the covariance estimates and get
\begin{align*}
\nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta)
&= \sum_{i = 1}^n \widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat\beta\mat{f}_{y_i}
= (\ten{R}\ttm[3]\widehat{\mat{\Delta}}_1^{-1}\ttm[2]\widehat{\mat{\Delta}}_2^{-1})_{(3)}\t{(\ten{F}\ttm[2]\mat\beta)_{(3)}}\\
\nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta)
&= \sum_{i = 1}^n \widehat{\mat{\Delta}}_2^{-1}\mat{R}_i\widehat{\mat{\Delta}}_1^{-1}\mat\alpha\t{\mat{f}_{y_i}}
= (\ten{R}\ttm[3]\widehat{\mat{\Delta}}_1^{-1}\ttm[2]\widehat{\mat{\Delta}}_2^{-1})_{(2)}\t{(\ten{F}\ttm[3]\mat\alpha)_{(2)}}
\d\hat{l} &= -\langle \ten{R}\times_{j\in[r]}{\widehat{\mat{\Delta}}}_j^{-1}, \d\ten{R} \rangle \\
&= \sum_{j = 1}^r \langle \ten{R}\times_{j\in[r]}{\widehat{\mat{\Delta}}}_j^{-1}, \ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k\times_j\d\widehat{\mat{\alpha}}_j \rangle \\
&= \sum_{j = 1}^r \tr\big( (\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}\t{(\ten{R}\times_{j\in[r]}{\widehat{\mat{\Delta}}}_j^{-1})_{(j)}}\d\widehat{\mat{\alpha}}_j \big).
\end{align*}
Through that the gradient for all the parameter matrices is
\begin{align*}
\nabla_{\widehat{\mat{\alpha}}_j}\hat{l} &= (\ten{R}\times_{j\in[r]}{\widehat{\mat{\Delta}}}_j^{-1})_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}}
\qquad{\color{gray}p_j\times q_j} && j = 1, ..., r.
\end{align*}
% Now we assume the residuals covariance has the form $\mat\Delta = \mat\Delta_1\otimes\mat\Delta_2$ where $\mat\Delta_1$, $\mat\Delta_2$ are $q\times q$, $p\times p$ covariance matrices, respectively. This is analog to the case that $\mat{R}_i$'s are i.i.d. Matrix Normal distribution
% \begin{displaymath}
% \mat{R}_i = \mat{X}_i - \mat\mu - \mat\beta\mat{f}_{y_i}\t{\mat\alpha} \sim \mathcal{MN}_{p\times q}(\mat 0, \mat\Delta_2, \mat\Delta_1).
% \end{displaymath}
% The density of the Matrix Normal (with mean zero) is equivalent to the vectorized quantities being multivariate normal distributed with Kronecker structured covariance
% \begin{align*}
% f(\mat R)
% &= \frac{1}{\sqrt{(2\pi)^{p q}|\mat\Delta|}}\exp\left(-\frac{1}{2}\t{\vec(\mat{R})} \mat\Delta^{-1}\vec(\mat{R})\right) \\
% &= \frac{1}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{1}{2}\tr(\mat\Delta_1^{-1}\t{\mat{R}}\mat\Delta_2^{-1}\mat{R})\right)
% \end{align*}
% which leads for given data to the log-likelihood
% \begin{displaymath}
% l(\mat{\mu}, \mat\Delta_1, \mat\Delta_2) =
% -\frac{n p q}{2}\log 2\pi
% -\frac{n p}{2}\log|\mat{\Delta}_1|
% -\frac{n q}{2}\log|\mat{\Delta}_2|
% -\frac{1}{2}\sum_{i = 1}^n \tr(\mat\Delta_1^{-1}\t{\mat{R}_i}\mat\Delta_2^{-1}\mat{R}_i).
% \end{displaymath}
% \subsection{MLE covariance estimates}
% Out first order of business is to derive the MLE estimated of the covariance matrices $\mat\Delta_1$, $\mat\Delta_2$ (the mean estimate $\widehat{\mat\mu}$ is trivial). Therefore, we look at the differentials with respect to changes in the covariance matrices as
% \begin{align*}
% \d l(\mat\Delta_1, \mat\Delta_2) &=
% -\frac{n p}{2}\d\log|\mat{\Delta}_1|
% -\frac{n q}{2}\d\log|\mat{\Delta}_2|
% -\frac{1}{2}\sum_{i = 1}^n
% \tr( (\d\mat\Delta_1^{-1})\t{\mat{R}_i}\mat\Delta_2^{-1}\mat{R}_i
% + \mat\Delta_1^{-1}\t{\mat{R}_i}(\d\mat\Delta_2^{-1})\mat{R}_i) \\
% &=
% -\frac{n p}{2}\tr\mat{\Delta}_1^{-1}\d\mat{\Delta}_1
% -\frac{n q}{2}\tr\mat{\Delta}_2^{-1}\d\mat{\Delta}_2 \\
% &\qquad\qquad
% +\frac{1}{2}\sum_{i = 1}^n
% \tr( \mat\Delta_1^{-1}(\d\mat\Delta_1)\mat\Delta_1^{-1}\t{\mat{R}_i}\mat\Delta_2^{-1}\mat{R}_i
% + \mat\Delta_1^{-1}\t{\mat{R}_i}\mat\Delta_2^{-1}(\d\mat\Delta_2)\mat\Delta_2^{-1}\mat{R}_i) \\
% &= \frac{1}{2}\tr\!\Big(\Big(
% -n p \mat{I}_q + \mat\Delta_1^{-1}\sum_{i = 1}^n \t{\mat{R}_i}\mat\Delta_2^{-1}\mat{R}_i
% \Big)\mat{\Delta}_1^{-1}\d\mat{\Delta}_1\Big) \\
% &\qquad\qquad
% + \frac{1}{2}\tr\!\Big(\Big(
% -n q \mat{I}_p + \mat\Delta_2^{-1}\sum_{i = 1}^n \mat{R}_i\mat\Delta_1^{-1}\t{\mat{R}_i}
% \Big)\mat{\Delta}_2^{-1}\d\mat{\Delta}_2\Big) \overset{!}{=} 0.
% \end{align*}
% Setting $\d l$ to zero yields the MLE estimates as
% \begin{displaymath}
% \widehat{\mat{\mu}} = \overline{\mat X}{\color{gray}\quad(p\times q)}, \qquad
% \widehat{\mat\Delta}_1 = \frac{1}{n p}\sum_{i = 1}^n \t{\mat{R}_i}\widehat{\mat\Delta}_2^{-1}\mat{R}_i{\color{gray}\quad(q\times q)}, \qquad
% \widehat{\mat\Delta}_2 = \frac{1}{n q}\sum_{i = 1}^n \mat{R}_i\widehat{\mat\Delta}_1^{-1}\t{\mat{R}_i}{\color{gray}\quad(p\times p)}.
% \end{displaymath}
% Next, analog to above, we take the estimated log-likelihood and derive gradients with respect to $\mat{\alpha}$, $\mat{\beta}$.
% The estimated log-likelihood derives by replacing the unknown covariance matrices by there MLE estimates leading to
% \begin{displaymath}
% \hat{l}(\mat\alpha, \mat\beta) =
% -\frac{n p q}{2}\log 2\pi
% -\frac{n p}{2}\log|\widehat{\mat{\Delta}}_1|
% -\frac{n q}{2}\log|\widehat{\mat{\Delta}}_2|
% -\frac{1}{2}\sum_{i = 1}^n \tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat{R}_i)
% \end{displaymath}
% and its differential
% \begin{displaymath}
% \d\hat{l}(\mat\alpha, \mat\beta) =
% -\frac{n p}{2}\d\log|\widehat{\mat{\Delta}}_1|
% -\frac{n q}{2}\d\log|\widehat{\mat{\Delta}}_2|
% -\frac{1}{2}\sum_{i = 1}^n \d\tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat{R}_i).
% \end{displaymath}
% We first take a closer look at the sum. After a bit of algebra using $\d\mat A^{-1} = -\mat A^{-1}(\d\mat A)\mat A^{-1}$ and the definitions of $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$ the sum can be rewritten
% \begin{displaymath}
% \frac{1}{2}\sum_{i = 1}^n \d\tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat{R}_i)
% = \sum_{i = 1}^n \tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\d\mat{R}_i)
% - \frac{np}{2}\d\log|\widehat{\mat\Delta}_1|
% - \frac{nq}{2}\d\log|\widehat{\mat\Delta}_2|.
% \end{displaymath}
% This means that most of the derivative cancels out and we get
% \begin{align*}
% \d\hat{l}(\mat\alpha, \mat\beta)
% &= \sum_{i = 1}^n \tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\d\mat{R}_i) \\
% &= \sum_{i = 1}^n \tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}((\d\mat\beta)\mat{f}_{y_i}\t{\mat\alpha} + \mat\beta\mat{f}_{y_i}\t{(\d\mat\alpha}))) \\
% &= \sum_{i = 1}^n \t{\vec(\widehat{\mat{\Delta}}_2^{-1}\mat{R}_i\widehat{\mat{\Delta}}_1^{-1}\mat\alpha\t{\mat{f}_{y_i}})}\d\vec\mat\beta
% + \sum_{i = 1}^n \t{\vec(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat\beta\mat{f}_{y_i})}\d\vec\mat\alpha
% \end{align*}
% which means the gradients are
% \begin{align*}
% \nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta)
% &= \sum_{i = 1}^n \widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat\beta\mat{f}_{y_i}
% = (\ten{R}\ttm[3]\widehat{\mat{\Delta}}_1^{-1}\ttm[2]\widehat{\mat{\Delta}}_2^{-1})_{(3)}\t{(\ten{F}\ttm[2]\mat\beta)_{(3)}}\\
% \nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta)
% &= \sum_{i = 1}^n \widehat{\mat{\Delta}}_2^{-1}\mat{R}_i\widehat{\mat{\Delta}}_1^{-1}\mat\alpha\t{\mat{f}_{y_i}}
% = (\ten{R}\ttm[3]\widehat{\mat{\Delta}}_1^{-1}\ttm[2]\widehat{\mat{\Delta}}_2^{-1})_{(2)}\t{(\ten{F}\ttm[3]\mat\alpha)_{(2)}}
% \end{align*}
\paragraph{Comparison to the general case:} There are two main differences, first the general case has a closed form solution for the gradient due to the explicit nature of the MLE estimate of $\widehat{\mat\Delta}$ compared to the mutually dependent MLE estimates $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$. On the other hand the general case has dramatically bigger dimensions of the covariance matrix ($p q \times p q$) compared to the two Kronecker components with dimensions $q \times q$ and $p \times p$. This means that in the general case there is a huge performance penalty in the dimensions of $\widehat{\mat\Delta}$ while in the other case an extra estimation is required to determine $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$.
@ -524,7 +597,7 @@ and therefore the gradients
\section{Thoughts on initial value estimation}
\todo{This section uses an alternative notation as it already tries to generalize to general multi-dimensional arrays. Furthermore, one of the main differences is that the observation are indexed in the \emph{last} mode. The benefit of this is that the mode product and parameter matrix indices match not only in the population model but also in sample versions.}
Let $\ten{X}, \ten{F}$ be order (rank) $r$ tensors of dimensions $p_1\times ... \times p_r$ and $q_1\times ... \times q_r$, respectively. Also denote the error tensor $\epsilon$ of the same order and dimensions as $\ten{X}$. The considered model for the $i$'th observation is
Let $\ten{X}, \ten{F}$ be order $r$ tensors of dimensions $p_1\times ... \times p_r$ and $q_1\times ... \times q_r$, respectively. Also denote the error tensor $\epsilon$ of the same order and dimensions as $\ten{X}$. The considered model for the $i$'th observation is
\begin{displaymath}
\ten{X}_i = \ten{\mu} + \ten{F}_i\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r \} + \ten{\epsilon}_i
\end{displaymath}
@ -532,7 +605,7 @@ where we assume $\ten{\epsilon}_i$ to be i.i.d. mean zero tensor normal distribu
\begin{displaymath}
\ten{X} = \ten{\mu} + \ten{F}\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{I}_n \} + \ten{\epsilon}
\end{displaymath}
which is almost identical as the observations $\ten{X}_i, \ten{F}_i$ are stacked on an addition $r + 1$ mode leading to response, predictor and error tensors $\ten{X}, \ten{F}$ of order (rank) $r + 1$ and dimensions $p_1\times...\times p_r\times n$ for $\ten{X}, \ten{\epsilon}$ and $q_1\times...\times q_r\times n$ for $\ten{F}$.
which is almost identical as the observations $\ten{X}_i, \ten{F}_i$ are stacked on an addition $r + 1$ mode leading to response, predictor and error tensors $\ten{X}, \ten{F}$ of order $r + 1$ and dimensions $p_1\times...\times p_r\times n$ for $\ten{X}, \ten{\epsilon}$ and $q_1\times...\times q_r\times n$ for $\ten{F}$.
In the following we assume w.l.o.g that $\ten{\mu} = 0$, as if this is not true we simply replace $\ten{X}_i$ with $\ten{X}_i - \ten{\mu}$ for $i = 1, ..., n$ before collecting all the observations in the response tensor $\ten{X}$.