tensor_predictors/LaTeX/main.tex

\documentclass[a4paper, 10pt]{article}

\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{fullpage}
\usepackage{amsmath, amssymb, amstext, amsthm}
\usepackage{bm}                         % \boldsymbol and italic corrections, ...
\usepackage[pdftex]{hyperref}
\usepackage{makeidx}                    % Index (Symbols, Names, ...)
\usepackage{xcolor, graphicx}           % colors and including images
\usepackage{tikz}
\usepackage[
    % backend=bibtex,
    style=authoryear-comp
]{biblatex}

% Document meta into
\title{Derivation of Gradient Descent Algorithm for K-PIR}
\author{Daniel Kapla}
\date{November 24, 2021}
% Set PDF title, author and creator.
\AtBeginDocument{
    \hypersetup{
        pdftitle = {Derivation of Gradient Descent Algorithm for K-PIR},
        pdfauthor = {Daniel Kapla},
        pdfcreator = {\pdftexbanner}
    }
}

\makeindex

% Bibliography resource(s)
\addbibresource{main.bib}

% Setup environments
% Theorem, Lemma
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{example}{Example}
% Definition
\theoremstyle{definition}
\newtheorem{defn}{Definition}
% Remark
\theoremstyle{remark}
\newtheorem{remark}{Remark}

% Define math macros
\newcommand{\mat}[1]{\boldsymbol{#1}}
\newcommand{\ten}[1]{\mathcal{#1}}
\renewcommand{\vec}{\operatorname{vec}}
\newcommand{\dist}{\operatorname{dist}}
\newcommand{\rank}{\operatorname{rank}}
\DeclareMathOperator{\kron}{\otimes}    % Kronecker Product
\DeclareMathOperator{\hada}{\odot}      % Hadamard Product
\newcommand{\ttm}[1][n]{\times_{#1}}    % n-mode product (Tensor Times Matrix)
\DeclareMathOperator{\df}{\operatorname{df}}
\DeclareMathOperator{\tr}{\operatorname{tr}}
\DeclareMathOperator{\var}{Var}
\DeclareMathOperator{\cov}{Cov}
\DeclareMathOperator{\E}{\operatorname{\mathbb{E}}}
% \DeclareMathOperator{\independent}{{\bot\!\!\!\bot}}
\DeclareMathOperator*{\argmin}{{arg\,min}}
\DeclareMathOperator*{\argmax}{{arg\,max}}
\newcommand{\D}{\textnormal{D}}
\renewcommand{\d}{\textnormal{d}}
\renewcommand{\t}[1]{{#1^{\prime}}}
\newcommand{\pinv}[1]{{#1^{\dagger}}} % `Moore-Penrose pseudoinverse`
\newcommand{\todo}[1]{{\color{red}TODO: #1}}

% \DeclareFontFamily{U}{mathx}{\hyphenchar\font45}
% \DeclareFontShape{U}{mathx}{m}{n}{
%       <5> <6> <7> <8> <9> <10>
%       <10.95> <12> <14.4> <17.28> <20.74> <24.88>
%       mathx10
%       }{}
% \DeclareSymbolFont{mathx}{U}{mathx}{m}{n}
% \DeclareMathSymbol{\bigtimes}{1}{mathx}{"91}

\begin{document}

\maketitle

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%                               Preliminary                                %%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Notation}
Vectors are write as boldface lowercase letters (e.g. $\mat a$, $\mat b$), matrices use boldface uppercase or Greek letters (e.g. $\mat A$, $\mat B$, $\mat\alpha$, $\mat\Delta$). The identity matrix of dimensions $p\times p$ is denoted by $\mat{I}_p$ and the commutation matrix as $\mat{K}_{p, q}$ or $\mat{K}_p$ is case of $p = q$. Tensors, meaning multi-dimensional arrays of order at least 3, use uppercase calligraphic letters (e.g. $\ten{A}$, $\ten{B}$, $\ten{X}$, $\ten{Y}$, $\ten{F}$). Boldface indices (e.g. $\mat{i}, \mat{j}, \mat{k}$) denote multi-indices $\mat{i} = (i_1, ..., i_r)\in[\mat{d}]$ where the bracket notation is a shorthand for $[r] = \{1, ..., r\}$ which in conjunction with a multi-index as argument means $[\mat{d}] = [d_1]\times ... \times[d_K]$.

Let $\ten{A} = (a_{i_1,...,i_r})\in\mathbb{R}^{d_1\times ...\times d_r}$ be an order\footnote{Also called rank, therefore the variable name $r$, but this term is \emph{not} used as it leads to confusion with the rank as in ``the rank of a matrix''.} $r$ tensor where $r\in\mathbb{N}$ is the number of modes or axis of $\ten{A}$. For matrices $\mat{B}_k\in\mathbb{R}^{p_k\times d_k}$ with $k\in[r] = \{1, 2, ..., r\}$ the \emph{multi-linear multiplication} is defined element wise as
\begin{displaymath}
    (\ten{A}\times\{\mat{B}_1, ..., \mat{B}_r\})_{j_1, ..., j_r} = \sum_{i_1, ..., i_r = 1}^{d_1, ..., d_r} a_{i_1, ..., i_r}(B_{1})_{j_1, i_1} \cdots (B_{r})_{j_r, i_r}
\end{displaymath}
which results in an order $r$ tensor of dimensions $p_1\times ...\times p_k)$. With this the \emph{$k$-mode product} between the tensor $\ten{A}$ with the matrix $\mat{B}_k$ is given by
\begin{displaymath}
    \mat{A}\times_k\mat{B}_k = \ten{A}\times\{\mat{I}_{d_1}, ..., \mat{I}_{d_{k-1}}, \mat{B}_{k}, \mat{I}_{d_{k+1}}, ..., \mat{I}_{d_r}\}.
\end{displaymath}
Furthermore, the notation $\ten{A}\times_{k\in S}$ is a short hand for writing the iterative application if the mode product for all indices in $S\subset[r]$. For example $\ten{A}\times_{k\in\{2, 5\}}\mat{B}_k = \ten{A}\times_2\mat{B}_2\times_5\mat{B}_5$. By only allowing $S$ to be a set this notation is unambiguous because the mode products commutes for different modes $j\neq k\Rightarrow\ten{A}\times_j\mat{B}_j\times_k\mat{B}_k = \ten{A}\times_k\mat{B}_k\times_j\mat{B}_j$.

The \emph{inner product} between two tensors of the same order and dimensions is
\begin{displaymath}
    \langle\ten{A}, \ten{B}\rangle = \sum_{i_1, ..., i_r} a_{i_1, ..., i_r}b_{i_1, ..., i_r}
\end{displaymath}
with which the \emph{Frobenius Norm} $\|\ten{A}\|_F = \sqrt{\langle\ten{A}, \ten{A}\rangle}$. Of interest is also the \emph{maximum norm} $\|\ten{A}\|_{\infty} = \max_{i_1, ..., i_K} a_{i_1, ..., i_K}$. Furthermore, the Frobenius and maximum norm are also used for matrices while for a vector $\mat{a}$ the \emph{2 norm} is $\|\mat{a}\|_2 = \sqrt{\langle\mat{a}, \mat{a}\rangle}$.

Matrices and tensor can be \emph{vectorized} by the \emph{vectorization} operator $\vec$. For tensors of order at least $2$ the \emph{flattening} (or \emph{unfolding} or \emph{matricization}) is a reshaping of the tensor into a matrix along an particular mode. For a tensor $\ten{A}$ of order $r$ and dimensions $d_1, ..., d_r$ the $k$-mode unfolding $\ten{A}_{(k)}$ is a $d_k\times \prod_{l=1, l\neq k}d_l$ matrix. For the tensor $\ten{A} = (a_{i_1,...,i_r})\in\mathbb{R}^{d_1, ..., d_r}$ the elements of the $k$ unfolded tensor $\ten{A}_{(k)}$ are
\begin{displaymath}
    (\ten{A}_{(k)})_{i_k, j} = a_{i_1, ..., i_r}\quad\text{ with }\quad j = 1 + \sum_{\substack{l = 1\\l \neq k}}^r (i_l - 1) \prod_{\substack{m = 1\\m\neq k}}^{l - 1}d_m.
\end{displaymath}

The rank of a tensor $\ten{A}$ of dimensions $d_1\times ...\times d_r$ is given by a vector $\rank{\ten{A}} = (a_1, ..., a_r)\in[d_1]\times...\times[d_r]$ where $a_k = \rank(\ten{A}_{(k)})$ is the usual matrix rank of the $k$ unfolded tensor.


{\color{red}$\mathcal{S}^p$, $\mathcal{S}_{+}^p$, $\mathcal{S}_{++}^p$ symmetric matrices of dimensions $p\times p$, or call it $\operatorname{Sym}(p)$}

{\color{red}The group of orthogonas matrices $O(p)$ of dim $p\times p$, where $O(p, q)$ are the $p\times q$ matrices (a.k.a. the Stiefel manifold)}


\section{Tensor Normal Distribution}
Let $\ten{X}$ be a multi-dimensional array random variable of order $r$ with dimensions $p_1\times ... \times p_r$ written as
\begin{displaymath}
    \ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r).
\end{displaymath}
Its density is given by
\begin{displaymath}
    f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{p_{\lnot i}}} \Big)^{-1}
        \exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right)
\end{displaymath}
where $p_{\lnot i} = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution
\begin{displaymath}
    \vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1)
\end{displaymath}
with $p = \prod_{i = 1}^r p_i$.

\begin{theorem}[Tensor Normal to Multi-Variate Normal equivalence]
    For a multi-dimensional random variable $\ten{X}$ of order $r$ with dimensions $p_1\times ..., p_r$. Let $\ten{\mu}$ be the mean of the same order and dimensions as $\ten{X}$ and the mode covariance matrices $\mat{\Delta}_i$ of dimensions $p_i\times p_i$ for $i = 1, ..., n$. Then the tensor normal distribution is equivalent to the multi-variate normal distribution by the relation
    \begin{displaymath}
        \ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r)
            \qquad\Leftrightarrow\qquad
        \vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes ...\otimes \mat{\Delta}_1)
    \end{displaymath}
    where $p = \prod_{i = 1}^r p_i$.
\end{theorem}
\begin{proof}
    A straight forward way is to rewrite the Tensor Normal density as the density of a Multi-Variate Normal distribution depending on the vectorization of $\ten{X}$. First consider
    \begin{align*}
        \langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle
            &= \t{\vec(\ten{X} - \mu)}\vec((\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\}) \\
            &= \t{\vec(\ten{X} - \mu)}(\mat{\Delta}_r^{-1}\otimes ...\otimes\mat{\Delta}_1^{-1})\vec(\ten{X} - \mu) \\
            &= \t{(\vec\ten{X} - \vec\mu)}(\mat{\Delta}_r\otimes ...\otimes\mat{\Delta}_1)^{-1}(\vec\ten{X} - \vec\mu).
    \end{align*}
    Next, using a property of the determinant of a Kronecker product $|\mat{\Delta}_1\otimes\mat{\Delta}_2| = |\mat{\Delta}_1|^{p_2}|\mat{\Delta}_2|^{p_1}$ yields
    \begin{displaymath}
        |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1|
            = |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_2|^{p_1}|\mat{\Delta}_1|^{p_{\lnot 1}}
    \end{displaymath}
    where $p_{\lnot i} = \prod_{j \neq i}p_j$. By induction over $r$ the relation
    \begin{displaymath}
        |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1|
            = \prod_{i = 1}^r |\mat{\Delta}_i|^{p_{\lnot i}}
    \end{displaymath}
    holds for arbitrary order $r$. Substituting into the Tensor Normal density leads to
    \begin{align*}
        f(\ten{X}) = \Big( (2\pi)^p |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1| \Big)^{-1/2}
        \exp\!\left( -\frac{1}{2}\t{(\vec\ten{X} - \vec\mu)}(\mat{\Delta}_r\otimes ...\otimes\mat{\Delta}_1)^{-1}(\vec\ten{X} - \vec\mu) \right)
    \end{align*}
    which is the Multi-Variate Normal density of the $p$ dimensional vector $\vec\ten{X}$.
\end{proof}

When sampling from the Multi-Array Normal one way is to sample from the Multi-Variate Normal and then reshaping the result, but this is usually very inefficient because it requires to store the multi-variate covariance matrix which is very big. Instead, it is more efficient to sample $\ten{Z}$ as a tensor of the same shape as $\ten{X}$ with standard normal entries and then transform the $\ten{Z}$ to follow the Multi-Array Normal as follows
\begin{displaymath}
    \ten{Z}\sim\mathcal{TN}(0, \mat{I}_{p_1}, ..., \mat{I}_{p_r})
        \quad\Rightarrow\quad
    \ten{X} = \ten{Z}\times\{\mat{\Delta}_1^{1/2}, ..., \mat{\Delta}_r^{1/2}\} + \mu\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r).
\end{displaymath}
where the sampling from the standard Multi-Array Normal is done by sampling all of the elements of $\ten{Z}$ from a standard Normal.


\section{Introduction}
\todo{rewrite this section to multi-variate arrays (tensors)}
We assume the model
\begin{displaymath}
    \mat{X} = \mat{\mu} + \mat{\beta}\mat{f}_y \t{\mat{\alpha}} + \mat{\epsilon}
\end{displaymath}
where the dimensions of all the components are listed in Table~\ref{tab:dimensions}.
and its vectorized form
\begin{displaymath}
    \vec\mat{X} = \vec\mat{\mu} + (\mat{\alpha}\kron\mat{\beta})\vec\mat{f}_y + \vec\mat{\epsilon}
\end{displaymath}

\begin{table}[!htp]
    \centering
    \begin{minipage}{0.8\textwidth}
    \centering
    \begin{tabular}{l l}
        $\mat X, \mat\mu, \mat R, \mat\epsilon$     & $p\times q$ \\
        $\mat{f}_y$                                 & $k\times r$ \\
        $\mat\alpha$                                & $q\times r$ \\
        $\mat\beta$                                 & $p\times k$ \\
        $\mat\Delta$                                & $p q\times p q$ \\
        $\mat\Delta_1$                              & $q\times q$ \\
        $\mat\Delta_2$                              & $p\times p$ \\
        $\mat{r}$                                   & $p q\times 1$ \\
        \hline
        $\ten{X}, \ten{R}$                          & $n\times p\times q$ \\
        $\ten{F}$                                   & $n\times k\times r$ \\
    \end{tabular}
    \caption{\label{tab:dimensions}\small Summary listing of dimensions with the corresponding sample versions $\mat{X}_i, \mat{R}_i, \mat{r}_i, \mat{f}_{y_i}$ for $i = 1, ..., n$ as well as estimates $\widehat{\mat{\alpha}}, \widehat{\mat{\beta}}, \widehat{\mat\Delta}, \widehat{\mat\Delta}_1$ and $\widehat{\mat\Delta}_2$.}
    \end{minipage}
\end{table}

The log-likelihood $l$ given $n$ i.i.d. observations assuming that $\mat{X}_i\mid(Y = y_i)$ is normal distributed as
\begin{displaymath}
    \vec\mat{X}_i \sim \mathcal{N}_{p q}(\vec\mat\mu + (\mat\alpha\kron\mat\beta)\vec\mat{f}_{y_i}, \Delta)
\end{displaymath}
Replacing all unknown by there estimates gives the (estimated) log-likelihood
\begin{equation}\label{eq:log-likelihood-est}
    \hat{l}(\mat\alpha, \mat\beta) = -\frac{n q p}{2}\log 2\pi - \frac{n}{2}\log|\widehat{\mat\Delta}| - \frac{1}{2}\sum_{i = 1}^n \t{\mat{r}_i}\widehat{\mat\Delta}^{-1}\mat{r}_i
\end{equation}
where the residuals are
\begin{displaymath}
    \mat{r}_i = \vec\mat{X}_i - \vec\overline{\mat{X}} - (\mat\alpha\kron\mat\beta)\vec{\mat f}_{y_i}\qquad (p q \times 1)
\end{displaymath}
and the MLE estimate assuming $\mat\alpha, \mat\beta$ known for the covariance matrix $\widehat{\mat\Delta}$ as solution to the score equations is
\begin{equation}\label{eq:Delta}
    \widehat{\mat\Delta} = \frac{1}{n}\sum_{i = 1}^n \mat{r}_i \t{\mat{r}_i} \qquad(p q \times p q).
\end{equation}
Note that the log-likelihood estimate $\hat{l}$ only depends on $\mat\alpha, \mat\beta$. Next, we compute the gradient for $\mat\alpha$ and $\mat\beta$ of $\hat{l}$ used to formulate a Gradient Descent base estimation algorithm for $\mat\alpha, \mat\beta$ as the previous algorithmic. The main reason is to enable an estimation for bigger dimensions of the $\mat\alpha, \mat\beta$ coefficients since the previous algorithm does \emph{not} solve the high run time problem for bigger dimensions.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%                                Derivative                                %%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Derivative of the Log-Likelihood}
Start with the general case of $\mat X_i|(Y_i = y_i)$ is multivariate normal distributed with the covariance $\mat\Delta$ being a $p q\times p q$ positive definite symmetric matrix \emph{without} an further assumptions. We have $i = 1, ..., n$ observations following
\begin{displaymath}
    \mat{r}_i = \vec(\mat X_i - \mat\mu - \mat\beta\mat{f}_{y_i}\t{\mat\alpha}) \sim \mathcal{N}_{p q}(\mat 0, \mat\Delta).
\end{displaymath}
The MLE estimates of $\mat\mu, \mat\Delta$ are
\begin{displaymath}
    \widehat{\mat\mu} = \overline{\mat X} = \frac{1}{n}\sum_{i = 1}^n \mat X_i {\color{gray}\qquad(p\times q)},
    \qquad \widehat{\mat\Delta} = \frac{1}{n}\sum_{i = 1}^n \mat r_i\t{\mat r_i} {\color{gray}\qquad(p q\times p q)}.
\end{displaymath}
Substitution of the MLE estimates into the log-likelihood $l(\mat\mu, \mat\Delta, \mat\alpha, \mat\beta)$ gives the estimated log-likelihood $\hat{l}(\mat\alpha, \mat\beta)$ as
\begin{displaymath}
    \hat{l}(\mat\alpha, \mat\beta) = -\frac{n q p}{2}\log 2\pi - \frac{n}{2}\log|\widehat{\mat\Delta}| - \frac{1}{2}\sum_{i = 1}^n \t{\mat{r}_i}\widehat{\mat\Delta}^{-1}\mat{r}_i.
\end{displaymath}
We are interested in the gradients $\nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta)$, $\nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta)$ of the estimated log-likelihood. Therefore, we consider the differential of $\hat{l}$.
\begin{align}
    \d\hat{l}(\mat\alpha, \mat\beta)
        &= -\frac{n}{2}\log|\widehat{\mat{\Delta}}| - \frac{1}{2}\sum_{i = 1}^n \big(\t{(\d \mat{r}_i)}\widehat{\mat{\Delta}}^{-1} \mat{r}_i + \t{\mat{r}_i}(\d\widehat{\mat{\Delta}}^{-1}) \mat{r}_i + \t{\mat{r}_i}\widehat{\mat{\Delta}}^{-1} \d \mat{r}_i\big) \nonumber\\
        &= \underbrace{-\frac{n}{2}\log|\widehat{\mat{\Delta}}| - \frac{1}{2}\sum_{i = 1}^n \t{\mat{r}_i}(\d\widehat{\mat{\Delta}}^{-1}) \mat{r}_i}_{=\,0\text{ due to }\widehat{\mat{\Delta}}\text{ beeing the MLE}} \label{eq:deriv1}
            - \sum_{i = 1}^n \t{\mat{r}_i}\widehat{\mat{\Delta}}^{-1} \d \mat{r}_i.
\end{align}
The next step is to compute $\d \mat{r}_i$ which depends on both $\mat\alpha$ and $\mat\beta$
\begin{align*}
    \d\mat{r}_i(\mat\alpha, \mat\beta)
        &= -\d(\mat\alpha\kron \mat\beta)\vec\mat{f}_{y_i} \\
        &= -\vec\!\big( \mat{I}_{p q}\,\d(\mat\alpha\kron \mat\beta)\vec\mat{f}_{y_i} \big) \\
        &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})\,\d\vec(\mat\alpha\kron \mat\beta) \\
\intertext{using the identity \ref{eq:vecKron}, to obtain vectorized differentials, gives}
    \dots
        &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \,\d(\vec \mat\alpha\kron \vec \mat\beta) \\
        &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \big((\d\vec \mat\alpha)\kron \vec \mat\beta + \vec \mat\alpha\kron (\d\vec \mat\beta)\big) \\
        &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \big(\mat{I}_{r q}(\d\vec \mat\alpha)\kron (\vec \mat\beta)\mat{I}_1 + (\vec \mat\alpha)\mat{I}_1\kron \mat{I}_{k p}(\d\vec \mat\beta)\big) \\
        &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \big((\mat{I}_{r q}\kron\vec \mat\beta)\d\vec \mat\alpha + (\vec \mat\alpha\kron \mat{I}_{k p})\d\vec \mat\beta\big)
\end{align*}
Now, substitution of $\d\mat{r}_i$ into \eqref{eq:deriv1} gives the gradients (not dimension standardized versions of $\D\hat{l}(\mat\alpha)$, $\D\hat{l}(\mat\beta)$) by identification of the derivatives from the differentials (see: \todo{appendix})
\begin{align*}
    \nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta) &=
        \sum_{i = 1}^n (\t{\vec(\mat{f}_{y_i})}\kron\t{\mat{r}_i}\widehat{\mat\Delta}^{-1}) (\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) (\mat{I}_{r q}\kron\vec \mat\beta),
        {\color{gray}\qquad(q\times r)} \\
    \nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta)  &=
        \sum_{i = 1}^n (\t{\vec(\mat{f}_{y_i})}\kron\t{\mat{r}_i}\widehat{\mat\Delta}^{-1}) (\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) (\vec \mat\alpha\kron \mat{I}_{k p}).
        {\color{gray}\qquad(p\times k)}
\end{align*}
These quantities are very verbose as well as completely unusable for an implementation. By detailed analysis of the gradients we see that the main parts are only element permutations with a high sparsity. By defining the following compact matrix
\begin{equation}\label{eq:permTransResponse}
    \mat G = \vec^{-1}_{q r}\bigg(\Big( \sum_{j = 1}^n \vec\mat{f}_{y_j}\otimes \widehat{\mat\Delta}^{-1}\mat{r}_j \Big)_{\pi(i)}\bigg)_{i = 1}^{p q k r}{\color{gray}\qquad(q r \times p k)}
\end{equation}
with $\pi$ being a permutation of $p q k r$ elements corresponding to permuting the axis of a 4D tensor of dimensions $p\times q\times k\times r$ by $(2, 4, 1, 3)$. As a generalization of transposition this leads to a rearrangement of the elements corresponding to the permuted 4D tensor with dimensions $q\times r\times p\times k$ which is then vectorized and reshaped into a matrix of dimensions $q r \times p k$. With $\mat G$ the gradients simplify to \todo{validate this mathematically}
\begin{align*}
    \nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta) &=
        \vec_{q}^{-1}(\mat{G}\vec{\mat\beta}),
        {\color{gray}\qquad(q\times r)} \\
    \nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta)  &=
        \vec_{p}^{-1}(\t{\mat{G}}\vec{\mat\alpha}).
        {\color{gray}\qquad(p\times k)}
\end{align*}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%                      Kronecker Covariance Structure                      %%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Kronecker Covariance Structure}
As before we let the sample model for tensor valued opbservations and responses or oder $r$ be
\begin{displaymath}
    \ten{X} = \ten{\mu} + \ten{F}\times_{j\in[r]}\alpha_j + \ten{\epsilon}
\end{displaymath}
but the error tensor $\ten{\epsilon}\sim\mathcal{TN}(0, \mat{\Delta}_1, ..., \mat{\Delta}_r)$ is Tensor Normal distributed with mean zero and covariance matrices $\mat{\Delta}_1, ..., \mat{\Delta}_r$.

The sample model for $n$ observations has the same form with an additional sample axis in the last mode of $\ten{X}$ and $\ten{Y}$ with dimensions $p_1\times ...\times p_r\times n$ and $q_1\times ...\times q_r\times n$, respectively. Let the residual tensor be
\begin{displaymath}
    \ten{R} = \ten{X} - \ten{\mu} + \ten{F}\times_{j\in[r]}\alpha_j.
\end{displaymath}

By the definition of the Tensor Normal, using the notation $p_{\lnot j} = \prod_{k\neq j}p_j$, we get for observations $\ten{X}, \ten{F}$ the log-likelihood in terms of the residuals as
\begin{displaymath}
    l = -\frac{n p}{2}\log 2\pi
        -\sum_{j = 1}^r \frac{n p_{\lnot j}}{2}\log|\mat{\Delta}_j|
        -\frac{1}{2}\langle \ten{R}\times_{j\in[r]}\mat{\Delta}_j^{-1}, \ten{R} \rangle.
\end{displaymath}
Note that the log-likelihood depends on the covariance matrices $\mat{\Delta}_j$, $j = 1, ..., r$ as well as the mean $\mu$ and the parameter matrices $\mat{\alpha}_j$, $j = 1, ..., r$ through the residuals $\ten{R}$.

\subsection{MLE estimates}
For deriving the MLE estimates we compute the differential of the log-likelihood given the data as
\begin{displaymath}
    \d l =
        -\sum_{j = 1}^r \frac{n p_{\lnot j}}{2}\d\log|{\mat{\Delta}}_j|
        -\frac{1}{2}\sum_{j = 1}^r\langle {\ten{R}}\times_{k\in[r]\backslash j}{\mat{\Delta}}_k^{-1}\times_j\d{\mat{\Delta}}^{-1}_j \rangle
        -\langle {\ten{R}}\times_{j\in[r]}{\mat{\Delta}}_j^{-1}, \d{\ten{R}} \rangle.
\end{displaymath}
Using $\d\log|\mat{A}| = \tr(\mat{A}^{-1}\d\mat{A})$ and $\d\mat{A}^{-1} = -\mat{A}^{-1}(\d\mat{A})\mat{A}^{-1}$ as well as $\langle\ten{A}, \ten{B}\rangle = \tr(\ten{A}_{(j)}\t{\ten{B}_{(j)}})$ for any $j = 1, ..., r$ we get the differential of the estimated log-likelihood as
\begin{align*}
    \d \hat{l}
    &=
        -\sum_{j = 1}^r \frac{n p_{\lnot j}}{2}\tr(\widehat{\mat{\Delta}}_j^{-1}\d\widehat{\mat{\Delta}}_j)
        -\frac{1}{2}\sum_{j = 1}^r\tr\!\Big((\d{\widehat{\mat{\Delta}}}^{-1}_j)({\ten{R}}\times_{k\in[r]\backslash j}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}\t{\ten{R}_{(j)}}\Big)
        -\langle {\ten{R}}\times_{j\in[r]}{\widehat{\mat{\Delta}}}_j^{-1}, \d{\ten{R}} \rangle \\
    &=
        -\frac{1}{2}\sum_{j = 1}^r \tr\left(\Big(
            n p_{\lnot j}\mat{I}_{p_j} - \widehat{\mat{\Delta}}_j^{-1}(\ten{R}\times_{k\in[r]\backslash j}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}\t{\ten{R}_{(j)}}
        \right)\widehat{\mat{\Delta}}_j^{-1}\d\widehat{\mat{\Delta}}_j\Big)
        -\langle \ten{R}\times_{j\in[r]}{\widehat{\mat{\Delta}}}_j^{-1}, \d\ten{R} \rangle.
\end{align*}
With $\ten{R}$ not dependent on the $\widehat{\mat{\Delta}}_j$'s we get the covariance MLE estimates
\begin{align*}
    \widehat{\mat{\Delta}}_j &= \frac{1}{n p_{\lnot j}}(\ten{R}\times_{k\in[r]\backslash j}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}\t{\ten{R}_{(j)}}
        \qquad{\color{gray}p_j\times p_j} && j = 1, ..., r.
\end{align*}
as well as gradients (even though they are not realy used, except in the case of a pure gradient based estimation procedure which might ease the estimation burden as all the MLE estimates are cross dependent)
\begin{align*}
    \nabla_{\widehat{\mat{\Delta}}_j}\hat{l} &= \frac{1}{2}\widehat{\mat{\Delta}}_j^{-1}\big(
        \ten{R}_{(j)}\t{(\ten{R}\times_{k\in[r]\backslash j}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}}\widehat{\mat{\Delta}}_j^{-1} - n p_{\lnot j}\mat{I}_{p_j}
    \big) \\
        &= \frac{1}{2}\widehat{\mat{\Delta}}_j^{-1}\big(
            \ten{R}_{(j)}\t{(\ten{R}\times_{k\in[r]}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}} - n p_{\lnot j}\mat{I}_{p_j}
        \big)
        \qquad{\color{gray}p_j\times p_j} && j = 1, ..., r.
\end{align*}
We continue by substitution of the covariance estimates and get
\begin{align*}
    \d\hat{l} &= -\langle \ten{R}\times_{j\in[r]}{\widehat{\mat{\Delta}}}_j^{-1}, \d\ten{R} \rangle \\
        &= \sum_{j = 1}^r \langle \ten{R}\times_{j\in[r]}{\widehat{\mat{\Delta}}}_j^{-1}, \ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k\times_j\d\widehat{\mat{\alpha}}_j \rangle \\
        &= \sum_{j = 1}^r \tr\big( (\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}\t{(\ten{R}\times_{j\in[r]}{\widehat{\mat{\Delta}}}_j^{-1})_{(j)}}\d\widehat{\mat{\alpha}}_j \big).
\end{align*}
Through that the gradient for all the parameter matrices is
\begin{align*}
    \nabla_{\widehat{\mat{\alpha}}_j}\hat{l} &= (\ten{R}\times_{j\in[r]}{\widehat{\mat{\Delta}}}_j^{-1})_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}}
        \qquad{\color{gray}p_j\times q_j} && j = 1, ..., r.
\end{align*}

% Now we assume the residuals covariance has the form $\mat\Delta = \mat\Delta_1\otimes\mat\Delta_2$ where $\mat\Delta_1$, $\mat\Delta_2$ are $q\times q$, $p\times p$ covariance matrices, respectively. This is analog to the case that $\mat{R}_i$'s are i.i.d. Matrix Normal distribution
% \begin{displaymath}
%     \mat{R}_i = \mat{X}_i - \mat\mu - \mat\beta\mat{f}_{y_i}\t{\mat\alpha} \sim \mathcal{MN}_{p\times q}(\mat 0, \mat\Delta_2, \mat\Delta_1).
% \end{displaymath}
% The density of the Matrix Normal (with mean zero) is equivalent to the vectorized quantities being multivariate normal distributed with Kronecker structured covariance
% \begin{align*}
%     f(\mat R)
%         &= \frac{1}{\sqrt{(2\pi)^{p q}|\mat\Delta|}}\exp\left(-\frac{1}{2}\t{\vec(\mat{R})} \mat\Delta^{-1}\vec(\mat{R})\right) \\
%         &= \frac{1}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{1}{2}\tr(\mat\Delta_1^{-1}\t{\mat{R}}\mat\Delta_2^{-1}\mat{R})\right)
% \end{align*}
% which leads for given data to the log-likelihood
% \begin{displaymath}
%     l(\mat{\mu}, \mat\Delta_1, \mat\Delta_2) =
%         -\frac{n p q}{2}\log 2\pi
%         -\frac{n p}{2}\log|\mat{\Delta}_1|
%         -\frac{n q}{2}\log|\mat{\Delta}_2|
%         -\frac{1}{2}\sum_{i = 1}^n \tr(\mat\Delta_1^{-1}\t{\mat{R}_i}\mat\Delta_2^{-1}\mat{R}_i).
% \end{displaymath}
% \subsection{MLE covariance estimates}
% Out first order of business is to derive the MLE estimated of the covariance matrices $\mat\Delta_1$, $\mat\Delta_2$ (the mean estimate $\widehat{\mat\mu}$ is trivial). Therefore, we look at the differentials with respect to changes in the covariance matrices as
% \begin{align*}
%     \d l(\mat\Delta_1, \mat\Delta_2) &=
%     -\frac{n p}{2}\d\log|\mat{\Delta}_1|
%     -\frac{n q}{2}\d\log|\mat{\Delta}_2|
%     -\frac{1}{2}\sum_{i = 1}^n
%         \tr(  (\d\mat\Delta_1^{-1})\t{\mat{R}_i}\mat\Delta_2^{-1}\mat{R}_i
%             + \mat\Delta_1^{-1}\t{\mat{R}_i}(\d\mat\Delta_2^{-1})\mat{R}_i) \\
%         &=
%     -\frac{n p}{2}\tr\mat{\Delta}_1^{-1}\d\mat{\Delta}_1
%     -\frac{n q}{2}\tr\mat{\Delta}_2^{-1}\d\mat{\Delta}_2 \\
%         &\qquad\qquad
%     +\frac{1}{2}\sum_{i = 1}^n
%         \tr(  \mat\Delta_1^{-1}(\d\mat\Delta_1)\mat\Delta_1^{-1}\t{\mat{R}_i}\mat\Delta_2^{-1}\mat{R}_i
%             + \mat\Delta_1^{-1}\t{\mat{R}_i}\mat\Delta_2^{-1}(\d\mat\Delta_2)\mat\Delta_2^{-1}\mat{R}_i) \\
%         &= \frac{1}{2}\tr\!\Big(\Big(
%             -n p \mat{I}_q + \mat\Delta_1^{-1}\sum_{i = 1}^n \t{\mat{R}_i}\mat\Delta_2^{-1}\mat{R}_i
%         \Big)\mat{\Delta}_1^{-1}\d\mat{\Delta}_1\Big) \\
%             &\qquad\qquad
%             + \frac{1}{2}\tr\!\Big(\Big(
%             -n q \mat{I}_p + \mat\Delta_2^{-1}\sum_{i = 1}^n \mat{R}_i\mat\Delta_1^{-1}\t{\mat{R}_i}
%         \Big)\mat{\Delta}_2^{-1}\d\mat{\Delta}_2\Big) \overset{!}{=} 0.
% \end{align*}
% Setting $\d l$ to zero yields the MLE estimates as
% \begin{displaymath}
%     \widehat{\mat{\mu}} = \overline{\mat X}{\color{gray}\quad(p\times q)}, \qquad
%     \widehat{\mat\Delta}_1 = \frac{1}{n p}\sum_{i = 1}^n \t{\mat{R}_i}\widehat{\mat\Delta}_2^{-1}\mat{R}_i{\color{gray}\quad(q\times q)}, \qquad
%     \widehat{\mat\Delta}_2 = \frac{1}{n q}\sum_{i = 1}^n \mat{R}_i\widehat{\mat\Delta}_1^{-1}\t{\mat{R}_i}{\color{gray}\quad(p\times p)}.
% \end{displaymath}
% Next, analog to above, we take the estimated log-likelihood and derive gradients with respect to $\mat{\alpha}$, $\mat{\beta}$.
% The estimated log-likelihood derives by replacing the unknown covariance matrices by there MLE estimates leading to
% \begin{displaymath}
%     \hat{l}(\mat\alpha, \mat\beta) =
%         -\frac{n p q}{2}\log 2\pi
%         -\frac{n p}{2}\log|\widehat{\mat{\Delta}}_1|
%         -\frac{n q}{2}\log|\widehat{\mat{\Delta}}_2|
%         -\frac{1}{2}\sum_{i = 1}^n \tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat{R}_i)
% \end{displaymath}
% and its differential
% \begin{displaymath}
%     \d\hat{l}(\mat\alpha, \mat\beta) =
%         -\frac{n p}{2}\d\log|\widehat{\mat{\Delta}}_1|
%         -\frac{n q}{2}\d\log|\widehat{\mat{\Delta}}_2|
%         -\frac{1}{2}\sum_{i = 1}^n \d\tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat{R}_i).
% \end{displaymath}
% We first take a closer look at the sum. After a bit of algebra using $\d\mat A^{-1} = -\mat A^{-1}(\d\mat A)\mat A^{-1}$ and the definitions of $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$ the sum can be rewritten
% \begin{displaymath}
%     \frac{1}{2}\sum_{i = 1}^n \d\tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat{R}_i)
%         = \sum_{i = 1}^n \tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\d\mat{R}_i)
%             - \frac{np}{2}\d\log|\widehat{\mat\Delta}_1|
%             - \frac{nq}{2}\d\log|\widehat{\mat\Delta}_2|.
% \end{displaymath}
% This means that most of the derivative cancels out and we get
% \begin{align*}
%     \d\hat{l}(\mat\alpha, \mat\beta)
%         &= \sum_{i = 1}^n \tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\d\mat{R}_i) \\
%         &= \sum_{i = 1}^n \tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}((\d\mat\beta)\mat{f}_{y_i}\t{\mat\alpha} + \mat\beta\mat{f}_{y_i}\t{(\d\mat\alpha}))) \\
%         &= \sum_{i = 1}^n \t{\vec(\widehat{\mat{\Delta}}_2^{-1}\mat{R}_i\widehat{\mat{\Delta}}_1^{-1}\mat\alpha\t{\mat{f}_{y_i}})}\d\vec\mat\beta
%             + \sum_{i = 1}^n \t{\vec(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat\beta\mat{f}_{y_i})}\d\vec\mat\alpha
% \end{align*}
% which means the gradients are
% \begin{align*}
%     \nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta)
%         &= \sum_{i = 1}^n \widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat\beta\mat{f}_{y_i}
%         = (\ten{R}\ttm[3]\widehat{\mat{\Delta}}_1^{-1}\ttm[2]\widehat{\mat{\Delta}}_2^{-1})_{(3)}\t{(\ten{F}\ttm[2]\mat\beta)_{(3)}}\\
%     \nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta)
%         &= \sum_{i = 1}^n \widehat{\mat{\Delta}}_2^{-1}\mat{R}_i\widehat{\mat{\Delta}}_1^{-1}\mat\alpha\t{\mat{f}_{y_i}}
%         = (\ten{R}\ttm[3]\widehat{\mat{\Delta}}_1^{-1}\ttm[2]\widehat{\mat{\Delta}}_2^{-1})_{(2)}\t{(\ten{F}\ttm[3]\mat\alpha)_{(2)}}
% \end{align*}

\paragraph{Comparison to the general case:} There are two main differences, first the general case has a closed form solution for the gradient due to the explicit nature of the MLE estimate of $\widehat{\mat\Delta}$ compared to the mutually dependent MLE estimates $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$. On the other hand the general case has dramatically bigger dimensions of the covariance matrix ($p q \times p q$) compared to the two Kronecker components with dimensions $q \times q$ and $p \times p$. This means that in the general case there is a huge performance penalty in the dimensions of $\widehat{\mat\Delta}$ while in the other case an extra estimation is required to determine $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%                     Alternative covariance estimates                     %%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Alternative covariance estimates}
An alternative approach is \emph{not} to use the MLE estimates for $\mat\Delta_1$, $\mat\Delta_2$ but (up to scaling) unbiased estimates.
\begin{displaymath}
    \widetilde{\mat\Delta}_1 = \frac{1}{n}\sum_{i = 1}^n \t{\mat{R}_i}\mat{R}_i {\color{gray}\quad(q\times q)},\qquad
    \widetilde{\mat\Delta}_2 = \frac{1}{n}\sum_{i = 1}^n \mat{R}_i\t{\mat{R}_i} {\color{gray}\quad(p\times p)}.
\end{displaymath}
The unbiasednes comes directly from the following short computation;
\begin{displaymath}
    (\E\widetilde{\mat\Delta}_1)_{j,k} = \frac{1}{n}\sum_{i = 1}^n \sum_{l = 1}^p \E \mat{R}_{i,l,j}\mat{R}_{i,l,k}
        = \frac{1}{n}\sum_{i = 1}^n \sum_{l = 1}^p (\mat{\Delta}_{2})_{l,l}(\mat{\Delta}_{1})_{j,k}
        = (\mat\Delta_1\tr(\mat\Delta_2))_{j,k}.
\end{displaymath}
which means that $\E\widetilde{\mat\Delta}_1 = \mat\Delta_1\tr(\mat\Delta_2)$ and in analogy $\E\widetilde{\mat\Delta}_2 = \mat\Delta_2\tr(\mat\Delta_1)$. Now, we need to handle the scaling which can be estimated unbiasedly by
\begin{displaymath}
    \tilde{s} = \frac{1}{n}\sum_{i = 1}^n \|\mat{R}_i\|_F^2
\end{displaymath}
because with $\|\mat{R}_i\|_F^2 = \tr \mat{R}_i\t{\mat{R}_i} = \tr \t{\mat{R}_i}\mat{R}_i$ the scale estimate $\tilde{s} = \tr(\widetilde{\mat\Delta}_1) = \tr(\widetilde{\mat\Delta}_2)$. Then $\E\tilde{s} = \tr(\E\widetilde{\mat\Delta}_1) = \tr{\mat\Delta}_1 \tr{\mat\Delta}_2 = \tr({\mat\Delta}_1\otimes{\mat\Delta}_2)$. Leading to the estimate of the covariance as
\begin{displaymath}
    \widetilde{\mat\Delta} = \tilde{s}^{-1}(\widetilde{\mat{\Delta}}_1\otimes\widetilde{\mat{\Delta}}_2)
\end{displaymath}

\todo{ prove they are consistent, especially $\widetilde{\mat\Delta} = \tilde{s}^{-1}(\widetilde{\mat\Delta}_1\otimes\widetilde{\mat\Delta}_2)$!}

The hoped for a benefit is that these covariance estimates are in a closed form which means there is no need for an additional iterative estimations step. Before we start with the derivation of the gradients define the following two quantities
\begin{align*}
    \mat{S}_1 = \frac{1}{n}\sum_{i = 1}^n \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i = \frac{1}{n}\ten{R}_{(3)}\t{(\ten{R}\ttm[2]\widetilde{\mat{\Delta}}_2^{-1})_{(3)}}\quad{\color{gray}(q\times q)}, \\
    \mat{S}_2 = \frac{1}{n}\sum_{i = 1}^n \mat{R}_i\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} = \frac{1}{n}\ten{R}_{(2)}\t{(\ten{R}\ttm[3]\widetilde{\mat{\Delta}}_1^{-1})_{(2)}}\quad{\color{gray}(p\times p)}.
\end{align*}
\todo{Check tensor form!}

Now, the matrix normal with the covariance matrix of the vectorized quantities of the form $\mat{\Delta} = s^{-1}(\mat{\Delta}_1\otimes\mat{\Delta}_2)$ has the form
\begin{align*}
    f(\mat R)
        &= \frac{1}{\sqrt{(2\pi)^{p q}|\mat\Delta|}}\exp\left(-\frac{1}{2}\t{\vec(\mat{R})} \mat\Delta^{-1}\vec(\mat{R})\right) \\
        &= \frac{s^{p q / 2}}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{s}{2}\tr(\mat\Delta_1^{-1}\t{\mat{R}}\mat\Delta_2^{-1}\mat{R})\right)
\end{align*}

The approximated log-likelihood is then
\begin{align*}
    \tilde{l}(\mat\alpha, \mat\beta)
        &=
        -\frac{n p q}{2}\log{2\pi}
        -\frac{n}{2}\log|\widetilde{\mat{\Delta}}|
        -\frac{1}{2}\sum_{i = 1}^n \t{\mat{r}_i}\widetilde{\mat{\Delta}}^{-1}\mat{r}_i \\
        &=
        -\frac{n p q}{2}\log{2\pi}
        +\frac{n p q}{2}\log\tilde{s}
        -\frac{n p}{2}\log|\widetilde{\mat{\Delta}}_1|
        -\frac{n q}{2}\log|\widetilde{\mat{\Delta}}_2|
        -\frac{\tilde{s}}{2}\sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i).
\end{align*}
The second form is due to the property of the determinant for scaling and the Kronecker product giving that $|\widetilde{\mat\Delta}| = (\tilde{s}^{-1})^{p q}|\widetilde{\mat{\Delta}}_1|^p |\widetilde{\mat{\Delta}}_2|^q$ as well as an analog Kronecker decomposition as in the MLE case.

Note that with the following holds
\begin{displaymath}
    \sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i)
    = n \tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1)
    = n \tr(\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2)
    = n \tr(\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1})
    = n \tr(\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1}).
\end{displaymath}

The derivation of the Gradient of the approximated log-likelihood $\tilde{l}$ is tedious but straight forward. We tackle the summands separately;
\begin{align*}
    \d\log\tilde{s} &= \tilde{s}^{-1}\d\tilde{s} = \frac{2}{n\tilde{s}}\sum_{i = 1}^n \tr(\t{\mat{R}_i}\d\mat{R}_i)
        = -\frac{2}{n\tilde{s}}\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\d\mat{\beta}), \\
    \d\log|\widetilde{\mat{\Delta}}_1| &=\tr(\widetilde{\mat{\Delta}}_1^{-1}\d\widetilde{\mat{\Delta}}_1) = \frac{2}{n}\sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\d\mat{R}_i)
        = -\frac{2}{n}\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\widetilde{\mat{\Delta}}_1^{-1}\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\d\mat{\beta}), \\
    \d\log|\widetilde{\mat{\Delta}}_2| &=\tr(\widetilde{\mat{\Delta}}_2^{-1}\d\widetilde{\mat{\Delta}}_2) = \frac{2}{n}\sum_{i = 1}^n \tr(\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\d\mat{R}_i)
        = -\frac{2}{n}\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\d\mat{\beta})
\end{align*}
as well as
\begin{displaymath}
    \d\,\tilde{s}\sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i)
        = (\d\tilde{s})\sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i)
            + \tilde{s}\, \d \sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i).
\end{displaymath}
We have
\begin{displaymath}
    \d\tilde{s} = -\frac{2}{n}\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\d\mat{\beta})
\end{displaymath}
and the remaining term
\begin{align*}
    \d\sum_{i = 1}^n\tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i)
         = 2\sum_{i = 1}^n \tr(&\t{\mat{f}_{y_i}}\t{\mat{\beta }}(\mat{R}_i   \widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}  +    \widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i - \widetilde{\mat{\Delta}}_2^{-1}   \mat{R}_i \widetilde{\mat{\Delta}}_1^{-1})\d\mat{\alpha} \\
         +\,&\mat{f}_{y_i} \t{\mat{\alpha}}(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} + \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} - \widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1})\d\mat{\beta }).
\end{align*}
The last one is tedious but straight forward. Its computation extensively uses the symmetry of $\widetilde{\mat{\Delta}}_1$, $\widetilde{\mat{\Delta}}_2$, the cyclic property of the trace and the relation $\d\mat{A}^{-1} = -\mat{A}^{-1}(\d\mat{A})\mat{A}^{-1}$.

Putting it all together
\begin{align*}
    \d\tilde{l}(\mat{\alpha}, \mat{\beta})
       &= \frac{n p q}{2}\Big(-\frac{2}{n\tilde{s}}\Big)\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\d\mat{\beta}) \\
       &\hspace{3em} - \frac{n p}{2}\Big(-\frac{2}{n}\Big)\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\widetilde{\mat{\Delta}}_1^{-1}\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\d\mat{\beta}) \\
       &\hspace{3em} - \frac{n q}{2}\Big(-\frac{2}{n}\Big)\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\d\mat{\beta}) \\
       &\hspace{3em} -\frac{1}{2}\Big(-\frac{2}{n}\Big)\Big(\sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i)\Big)\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\d\mat{\beta}) \\
       &\hspace{3em} -\frac{\tilde{s}}{2}2\sum_{i = 1}^n \tr\!\Big(\t{\mat{f}_{y_i}}\t{\mat{\beta }}(\mat{R}_i \widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1} + \widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i - \widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i \widetilde{\mat{\Delta}}_1^{-1})\d\mat{\alpha} \\
       &\hspace{3em} \hspace{4.7em} + \mat{f}_{y_i} \t{\mat{\alpha}}(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} + \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} - \widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1})\d\mat{\beta }\Big) \\
%
       &= \sum_{i = 1}^n \tr\bigg(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\Big(
            -p q \tilde{s}^{-1} \mat{R}_i + p \mat{R}_i\widetilde{\mat{\Delta}}_1^{-1} + q \widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i + \tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1)\mat{R}_i \\
       &\hspace{3em} \hspace{4.7em} - \tilde{s}(\mat{R}_i \widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1} + \widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i - \widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i \widetilde{\mat{\Delta}}_1^{-1})
       \Big)\d\mat{\alpha}\bigg) \\
       &\hspace{3em}+ \sum_{i = 1}^n \tr\bigg(\mat{f}_{y_i}\t{\mat{\alpha}}\Big(
            -p q \tilde{s}^{-1} \t{\mat{R}_i} + p \widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} + q \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1} + \tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1)\t{\mat{R}_i} \\
       &\hspace{3em}\hspace{3em} \hspace{4.7em} - \tilde{s}(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} + \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} - \widetilde{\mat{\Delta}}_1^{-1} \t{\mat{R}_i} \widetilde{\mat{\Delta}}_2^{-1})
       \Big)\d\mat{\beta}\bigg).
\end{align*}
Observe that the bracketed expressions before $\d\mat{\alpha}$ and $\d\mat{\beta}$ are transposes. Lets denote the expression for $\d\mat{\alpha}$ as $\mat{G}_i$ which has the form
\begin{displaymath}
    \mat{G}_i
        = (\tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1) - p q \tilde{s}^{-1})\mat{R}_i
        + (q\mat{I}_p - \tilde{s}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2)\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i
        + \mat{R}_i\widetilde{\mat{\Delta}}_1^{-1}(p\mat{I}_q - \tilde{s}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1})
        + \tilde{s}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i\widetilde{\mat{\Delta}}_1^{-1}
\end{displaymath}
and with $\mathcal{G}$ the order 3 tensor stacking the $\mat{G}_i$'s such that the first mode indexes the observation
\begin{displaymath}
    \ten{G}
        = (\tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1) - p q \tilde{s}^{-1})\ten{R}
        + \ten{R}\ttm[2](q\mat{I}_p - \tilde{s}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2)\widetilde{\mat{\Delta}}_2^{-1}
        + \ten{R}\ttm[3](p\mat{I}_q - \tilde{s}\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1)\widetilde{\mat{\Delta}}_1^{-1}
        + \tilde{s}\ten{R}\ttm[2]\widetilde{\mat{\Delta}}_2^{-1}\ttm[3]\widetilde{\mat{\Delta}}_1^{-1}
\end{displaymath}
This leads to the following form of the differential of $\tilde{l}$ given by
\begin{displaymath}
    \d\tilde{l}(\mat{\alpha}, \mat{\beta})
        = \sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{G}_i\d\mat{\alpha})
        + \sum_{i = 1}^n \tr(\mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{G}_i}\d\mat{\beta})
\end{displaymath}
and therefore the gradients
\begin{align*}
    \nabla_{\mat{\alpha}}\tilde{l}(\mat{\alpha}, \mat{\beta}) &= \sum_{i = 1}^n \t{\mat{G}_i}\mat{\beta}\mat{f}_{y_i}
        = \ten{G}_{(3)}\t{(\ten{F}\ttm[2]\mat{\beta})_{(3)}}, \\
    \nabla_{\mat{\beta}} \tilde{l}(\mat{\alpha}, \mat{\beta}) &= \sum_{i = 1}^n \mat{G}_i\mat{\alpha}\t{\mat{f}_{y_i}}
        = \ten{G}_{(2)}\t{(\ten{F}\ttm[3]\mat{\alpha})_{(2)}}.
\end{align*}

\todo{check the tensor version of the gradient!!!}

\newpage

\section{Thoughts on initial value estimation}
\todo{This section uses an alternative notation as it already tries to generalize to general multi-dimensional arrays. Furthermore, one of the main differences is that the observation are indexed in the \emph{last} mode. The benefit of this is that the mode product and parameter matrix indices match not only in the population model but also in sample versions.}
Let $\ten{X}, \ten{F}$ be order $r$ tensors of dimensions $p_1\times ... \times p_r$ and $q_1\times ... \times q_r$, respectively. Also denote the error tensor $\epsilon$ of the same order and dimensions as $\ten{X}$. The considered model for the $i$'th observation is
\begin{displaymath}
    \ten{X}_i = \ten{\mu} + \ten{F}_i\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r \} + \ten{\epsilon}_i
\end{displaymath}
where we assume $\ten{\epsilon}_i$ to be i.i.d. mean zero tensor normal distributed $\ten{\epsilon}\sim\mathcal{TN}(0, \mat{\Delta}_1, ..., \mat{\Delta}_r)$ for $\mat{\Delta}_j\in\mathcal{S}^{p_j}_{++}$, $j = 1, ..., r$. Given $i = 1, ..., n$ observations the collected model containing all observations
\begin{displaymath}
    \ten{X} = \ten{\mu} + \ten{F}\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{I}_n \} + \ten{\epsilon}
\end{displaymath}
which is almost identical as the observations $\ten{X}_i, \ten{F}_i$ are stacked on an addition $r + 1$ mode leading to response, predictor and error tensors $\ten{X}, \ten{F}$ of order $r + 1$ and dimensions $p_1\times...\times p_r\times n$ for $\ten{X}, \ten{\epsilon}$ and $q_1\times...\times q_r\times n$ for $\ten{F}$.

In the following we assume w.l.o.g that $\ten{\mu} = 0$, as if this is not true we simply replace $\ten{X}_i$ with $\ten{X}_i - \ten{\mu}$ for $i = 1, ..., n$ before collecting all the observations in the response tensor $\ten{X}$.

The goal here is to find reasonable estimates for $\mat{\alpha}_j$, $j = 1, ..., n$ for the mean model
\begin{displaymath}
    \E \ten{X}|\ten{F}, \mat{\alpha}_1, ..., \mat{\alpha}_r = \ten{F}\times\{\mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{I}_n\}
        = \ten{F}\times_{j\in[r]}\mat{\alpha}_j.
\end{displaymath}
Under the mean model we have using the general mode product relation $(\ten{A}\times_j\mat{B})_{(j)} = \mat{B}\ten{A}_{(j)}$ we get
\begin{align*}
    \ten{X}_{(j)}\t{\ten{X}_{(j)}} \overset{\text{SVD}}{=} \mat{U}_j\mat{D}_j\t{\mat{U}_j}
        = \mat{\alpha}_j(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}
          \t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}\t{\mat{\alpha}_j}
\end{align*}
for the $j = 1, ..., r$ modes. Using this relation we construct an iterative estimation process by setting the initial estimates of $\hat{\mat{\alpha}}_j^{(0)} = \mat{U}_j[, 1:q_j]$ which are the first $q_j$ columns of $\mat{U}_j$.

For getting least squares estimates for $\mat{\alpha}_j$, $j = 1, ..., r$ we observe that by matricization of the mean model
\begin{displaymath}
    \ten{X}_{(j)} = (\ten{F}\times_{k\in[r]}\mat{\alpha}_k)_{(j)} = \mat{\alpha}_j(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}
\end{displaymath}
leads to normal equations for each $\mat{\alpha}_j$, $j = 1, ..., r$
\begin{displaymath}
    \ten{X}_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}} = \mat{\alpha}_j(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}
\end{displaymath}
where the normal equations for $\mat{\alpha}_j$ depend on all the other $\mat{\alpha}_k$. With the initial estimates from above this allows an alternating approach. Index with $t = 1, ...$ the current iteration, then a new estimate $\widehat{\mat{\alpha}}_j^{(t)}$ given the previous estimates $\widehat{\mat{\alpha}}_k^{(t-1)}$, $k = 1, ..., r$ is computed as
\begin{displaymath}
    \widehat{\mat{\alpha}}_j^{(t)} =
        \ten{X}_{(j)}
        \t{\big(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k^{(t-1)}\big)_{(j)}}
    \left(
        \big(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k^{(t-1)}\big)_{(j)}
        \t{\big(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k^{(t-1)}\big)_{(j)}}
    \right)^{-1}
\end{displaymath}
for $j = 1, ..., r$ until convergence or a maximum number of iterations is exceeded. The final estimates are the least squares estimates by this procedure.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%                            Numerical Examples                            %%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Numerical Examples}
% The first example (which by it self is \emph{not} exemplary) is the estimation with parameters $n = 200$, $p = 11$, $q = 5$, $k = 14$ and $r = 9$. The ``true'' matrices $\mat\alpha$, $\mat\beta$ generated by sampling there elements i.i.d. standard normal like the responses $y$. Then, for each observation, $\mat{f}_y$ is computed as $\sin(s_{i, j} y + o_{i j})$ \todo{ properly describe} to fill the elements of $\mat{f}_y$. Then the $\mat{X}$'s are samples as
% \begin{displaymath}
%     \mat{X} = \mat{\beta}\mat{f}_y \t{\mat{\alpha}} + \mat{\epsilon}, \qquad \vec{\mat{\epsilon}} \sim \mathbb{N}_{p q}(\mat{0}, \mat{\Delta})
% \end{displaymath}
% where $\mat{\Delta}_{i j} = 0.5^{|i - j|}$ for $i, j = 1, ..., p q$.

\begin{table}[!ht]
    \centering
    % see: https://en.wikibooks.org/wiki/LaTeX/Tables
    \begin{tabular}{ll|r@{ }l *{3}{r@{.}l}}
          method & init
            & \multicolumn{2}{c}{loss}
            & \multicolumn{2}{c}{MSE}
            & \multicolumn{2}{c}{$\dist(\hat{\mat\alpha}, \mat\alpha)$}
            & \multicolumn{2}{c}{$\dist(\hat{\mat\beta}, \mat\beta)$}
            \\ \hline
          base     &  vlp & -2642&(1594) & 1&82  (2.714) & 0&248 (0.447) & 0&271 (0.458) \\
          new      &  vlp & -2704&(1452) & 1&78  (2.658) & 0&233 (0.438) & 0&260 (0.448) \\
          new      &   ls & -3479&  (95) & 0&99  (0.025) & 0&037 (0.017) & 0&035 (0.015) \\
          momentum &  vlp & -2704&(1452) & 1&78  (2.658) & 0&233 (0.438) & 0&260 (0.448) \\
          momentum &   ls & -3479&  (95) & 0&99  (0.025) & 0&037 (0.017) & 0&035 (0.015) \\
          approx   &  vlp &  6819&(1995) & 3&99 (12.256) & 0&267 (0.448) & 0&287 (0.457) \\
          approx   &   ls &  5457& (163) & 0&99  (0.025) & 0&033 (0.017) & 0&030 (0.012) \\
    \end{tabular}
    \caption{Mean (standard deviation) for simulated runs of $20$ repititions for the model $\mat{X} = \mat{\beta}\mat{f}_y\t{\mat{\alpha}}$ of dimensinos $(p, q) = (11, 7)$, $(k, r) = (3, 5)$ with a sample size of $n = 200$. The covariance structure is $\mat{\Delta} = \mat{\Delta}_2\otimes \mat{\Delta}_1$ for $\Delta_i = \text{AR}(\sqrt{0.5})$, $i = 1, 2$. The functions applied to the standard normal response $y$ are $\sin, \cos$ with increasing frequency.}
\end{table}

% \begin{figure}
%     \centering
%     \includegraphics{loss_Ex01.png}
% \end{figure}
% \begin{figure}
%     \centering
%     \includegraphics{estimates_Ex01.png}
% \end{figure}
% \begin{figure}
%     \centering
%     \includegraphics{Delta_Ex01.png}
% \end{figure}
% \begin{figure}
%     \centering
%     \includegraphics{hist_Ex01.png}
% \end{figure}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%                              Bib and Index                               %%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\printindex
\nocite{*}
\printbibliography

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%                                 Appendix                                 %%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\appendix
\section{Matrix Differential Rules}
Let $\mat A$ be a square matrix (and invertible if needed) and $|.|$ stands for the determinant
\begin{align*}
    \d\log\mat A &= \frac{1}{|\mat A|}\d\mat{A} \\
    \d|\mat A| &= |\mat A|\tr \mat{A}^{-1}\d\mat A \\
    \d\log|\mat A| &= \tr\mat{A}^{-1}\d\mat A \\
    \d\mat{X}^{-1} &= -\mat{X}^{-1}(\d\mat{X})\mat{X}^{-1}
\end{align*}

\section{Useful Matrix Identities}
In this section we summarize a few useful matrix identities, for more details see for example \cite{MatrixAlgebra-AbadirMagnus2005}.

For two matrices $\mat A$ of dimensions $q\times r$ and $\mat B$ of dimensions $p\times k$ holds
\begin{equation}\label{eq:vecKron}
    \vec(\mat A\kron\mat B) = (\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p)(\vec\mat A\kron\vec\mat B).
\end{equation}

Let $\mat A$ be a $p\times p$ dimensional non-singular matrix. Furthermore, let $\mat a, \mat b$ be $p$ vectors such that $\t{\mat b}A^{-1}\mat a\neq -1$, then
\begin{displaymath}
    (\mat A + \mat a\t{\mat b})^{-1} = \mat{A}^{-1} - \frac{1}{1 + \t{\mat b}A^{-1}\mat a}\mat{A}^{-1}\mat{a}\t{\mat{b}}\mat{A}^{-1}
\end{displaymath}
as well as
\begin{displaymath}
    \det(\mat A + \mat a\t{\mat b}) = \det(\mat A)(1 + \t{\mat b}{\mat A}^{-1}\mat a)
\end{displaymath}
which even holds in the case $\t{\mat b}A^{-1}\mat a = -1$. This is known as Sylvester's determinant theorem.


\section{Commutation Matrix and Permutation Identities}
\begin{center}
    Note: In this section we use 0-indexing for the sake of simplicity!
\end{center}
In this section we summarize relations between the commutation matrix and corresponding permutation. We also list some extensions to ``simplify'' or represent some term. This is mostly intended for implementation purposes and understanding of terms occurring in the computations.

Let $\mat A$ be an arbitrary $p\times q$ matrix. The permutation matrix $\mat K_{p, q}$ satisfies
\begin{displaymath}
    \mat{K}_{p, q}\vec{\mat{A}} = \vec{\t{\mat{A}}} \quad\Leftrightarrow\quad (\vec{\mat{A}})_{\pi_{p, q}(i)} = (\vec{\t{\mat{A}}})_{i}, \quad\text{for } i = 0, ..., p q - 1
\end{displaymath}
where $\pi_{p, q}$ is a permutation of the indices $i = 0, ..., p q - 1$ such that
\begin{displaymath}
    \pi_{p, q}(i + j p) = j + i q, \quad\text{for }i = 0, ..., p - 1; j = 0, ..., q - 1.
\end{displaymath}

\begin{table}[!htp]
    \centering
    \begin{minipage}{0.8\textwidth}
        \centering
        \begin{tabular}{l c l}
            $\mat{K}_{p, q}$                 & $\hat{=}$ & $\pi_{p, q}(i + j p) = j + i q$ \\
            $\mat{I}_r\kron\mat{K}_{p, q}$ & $\hat{=}$ & $\tilde{\pi}_{p, q, r}(i + j p + k p q) = j + i q + k p q$ \\
            $\mat{K}_{p, q}\kron\mat{I}_r$ & $\hat{=}$ & $\hat{\pi}_{p, q, r}(i + j p + k p q) = r(j + i q) + k$
        \end{tabular}
    \caption{\label{tab:commutation-permutation}Commutation matrix terms and corresponding permutations. Indices are all 0-indexed with the ranges; $i = 0, ..., p - 1$, $j = 0, ..., q - 1$ and $k = 0, ..., r - 1$.}
    \end{minipage}
\end{table}


\section{Matrix and Tensor Operations}

The \emph{Kronecker product}\index{Operations!Kronecker@$\kron$ Kronecker product} is denoted as $\kron$ and the \emph{Hadamard product} uses the symbol $\circ$. We also need the \emph{Khatri-Rao product}\index{Operations!KhatriRao@$\hada$ Khatri-Rao product}
$\hada$ as well as the \emph{Transposed Khatri-Rao product} $\odot_t$ (or \emph{Face-Splitting product}). There is also the \emph{$n$-mode Tensor Matrix Product}\index{Operations!ttm@$\ttm[n]$ $n$-mode tensor product} denoted by $\ttm[n]$ in conjunction with the \emph{$n$-mode Matricization} of a Tensor $\mat{T}$ written as $\mat{T}_{(n)}$, which is a matrix. See below for definitions and examples of these operations.\todo{ Definitions and Examples}

\todo{ resolve confusion between Khatri-Rao, Column-wise Kronecker / Khatri-Rao, Row-wise Kronecker / Khatri-Rao, Face-Splitting Product, .... Yes, its a mess.}
\paragraph{Kronecker Product $\kron$:}
\paragraph{Khatri-Rao Product $\hada$:}
\paragraph{Transposed Khatri-Rao Product $\odot_t$:} This is also known as the Face-Splitting Product and is the row-wise Kronecker product of two matrices. If relates to the Column-wise Kronecker Product through
\begin{displaymath}
    \t{(\mat{A}\odot_{t}\mat{B})} = \t{\mat{A}}\hada\t{\mat{B}}
\end{displaymath}

\paragraph{$n$-mode unfolding:} \emph{Unfolding}, also known as \emph{flattening} or \emph{matricization}, is an reshaping of a tensor into a matrix with rearrangement of the elements such that mode $n$ corresponds to columns of the result matrix and all other modes are vectorized in the rows. Let $\ten{T}$ be a tensor of order $m$ with dimensions $t_1\times ... \times t_n\times ... \times t_m$ and elements indexed by $(i_1, ..., i_n, ..., i_m)$. The $n$-mode flattening, denoted $\ten{T}_{(n)}$, is defined as a $(t_n, \prod_{k\neq n}t_k)$ matrix with element indices $(i_n, j)$ such that $j = \sum_{k = 1, k\neq n}^m i_k\prod_{l = 1, l\neq n}^{k - 1}t_l$.
\todo{ give an example!}

\paragraph{$n$-mode Tensor Product $\ttm[n]$:}
The \emph{$n$-mode tensor product} $\ttm[n]$ between a tensor $\mat{T}$ of order $m$ with dimensions $t_1\times t_2\times ... \times t_n\times ... \times t_m$ and a $p\times t_n$ matrix $\mat{M}$ is defined element-wise as
\begin{displaymath}
    (\ten{T}\ttm[n] \mat{M})_{i_1, ..., i_{n-1}, j, i_{n+1}, ..., i_m} = \sum_{k = 1}^{t_n} \ten{T}_{i_1, ..., i_{n-1}, k, i_{n+1}, ..., i_m} \mat{M}_{j, k}
\end{displaymath}
where $i_1, ..., i_{n-1}, i_{n+1}, ..., i_m$ run from $1$ to $t_1, ..., t_{n-1}, t_{n+1}, ..., t_m$, respectively. Furthermore, the $n$-th fiber index $j$ of the product ranges from $1$ to $p$. This gives a new tensor $\mat{T}\ttm[n]\mat{M}$ of order $m$ with dimensions $t_1\times t_2\times ... \times p\times ... \times t_m$.

\begin{example}[Matrix Multiplication Analogs]
    Let $\mat{A}$, $\mat{B}$ be two matrices with dimensions $t_1\times t_2$ and $p\times q$, respectively. Then $\mat{A}$ is also a tensor of order $2$, now the $1$-mode and $2$-mode products are element wise given by
    \begin{align*}
        (\mat{A}\ttm[1] \mat{B})_{i,j} &= \sum_{l = 1}^{t_1} \mat{A}_{l,j}\mat{B}_{i,l}
            = (\mat{B}\mat{A})_{i,j}
                                        & \text{for }t_1 = q, \\
        (\mat{A}\ttm[2] \mat{B})_{i,j} &= \sum_{l = 1}^{t_2} \mat{A}_{i,l}\mat{B}_{j,l}
            = (\mat{A}\t{\mat{B}})_{i,j} = \t{(\mat{B}\t{\mat{A}})}_{i,j}
                                        & \text{for }t_2 = q.
    \end{align*}
    In other words, the $1$-mode product equals $\mat{A}\ttm[1] \mat{B} = \mat{B}\mat{A}$ and the $2$-mode is $\mat{A}\ttm[2] \mat{B} = \t{(\mat{B}\t{\mat{A}})}$ in the case of the tensor $\mat{A}$ being a matrix.
\end{example}

\begin{example}[Order Three Analogs]
    Let $\mat{A}$ be a tensor of the form $t_1\times t_2\times t_3$ and $\mat{B}$ a matrix of dimensions $p\times q$, then the $n$-mode products have the following look
    \begin{align*}
        (\mat{A}\ttm[1]\mat{B})_{i,j,k} &= \sum_{l = 1}^{t_1} \mat{A}_{l,j,k}\mat{B}_{i,l} & \text{for }t_1 = q, \\
        (\mat{A}\ttm[2]\mat{B})_{i,j,k} &= \sum_{l = 1}^{t_2} \mat{A}_{i,l,k}\mat{B}_{j,l} \equiv (\mat{B}\mat{A}_{i,:,:})_{j,k} & \text{for }t_2 = q, \\
        (\mat{A}\ttm[3]\mat{B})_{i,j,k} &= \sum_{l = 1}^{t_3} \mat{A}_{i,j,l}\mat{B}_{k,l} \equiv \t{(\mat{B}\t{\mat{A}_{i,:,:}})}_{j,k} & \text{for }t_3 = q.
    \end{align*}
\end{example}

Letting $\ten{F}$ be the $3$-tensor of dimensions $n\times k\times r$ such that $\ten{F}_{i,:,:} = \mat{f}_{y_i}$, then
\begin{displaymath}
    \mat{\beta}\mat{f}_{y_i}\t{\mat{\alpha}} = (\ten{F}\ttm[2]\mat{\beta}\ttm[3]\mat{\alpha})_{i,:,:}
\end{displaymath}
or in other words, the $i$-th slice of the tensor product $\ten{F}\ttm[2]\mat{\beta}\ttm[3]\mat{\alpha}$ contains $\mat{\beta}\mat{f}_{y_i}\t{\mat{\alpha}}$ for $i = 1, ..., n$.
Another analog way of writing this is
\begin{displaymath}
    (\ten{F}\ttm[2]\mat{\beta}\ttm[3]\mat{\alpha})_{(1)} = \mathbb{F}_{y}(\t{\mat{\alpha}}\kron\t{\mat{\beta}})
\end{displaymath}

\section{Equivalencies}
In this section we give a short summary of alternative but equivalent operations.
Using the notation $\widehat{=}$ to indicate that two expressions are identical in the sense that they contain the same element in the same order but may have different dimensions. Meaning, when vectorizing ether side of $\widehat{=}$, they are equal ($\mat{A}\widehat{=}\mat{B}\ :\Leftrightarrow\ \vec{\mat{A}} = \vec{\mat{B}}$).

Therefore, we use $\mat{A}, \mat{B}, \mat{X}, \mat{F}, \mat{R}, ...$ for matrices. 3-Tensors are written as $\ten{A}, \ten{B}, \ten{T}, \ten{X}, \ten{F}, \ten{R}, ...$.

\begin{align*}
    \ten{T}\ttm[3]\mat{A}\ &{\widehat{=}}\ \mat{T}\t{\mat A} & \ten{T}(n, p, q)\ \widehat{=}\ \mat{T}(n p, q), \mat{A}(p, q) \\
    \ten{T}\ttm[2]\mat{B}\ &{\widehat{=}}\ \mat{B}\ten{T}_{(2)} & \ten{T}(n, p, q), \ten{T}_{(2)}(p, n q), \mat{B}(q, p)
\end{align*}

% \section{Matrix Valued Normal Distribution}
% A random variable $\mat{X}$ of dimensions $p\times q$ is \emph{Matrix-Valued Normal Distribution}, denoted 
% \begin{displaymath}
%     \mat{X}\sim\mathcal{MN}_{p\times q}(\mat{\mu}, \mat{\Delta}_2, \mat{\Delta}_1),
% \end{displaymath}
% if and only if $\vec\mat{X}\sim\mathcal{N}_{p q}(\vec\mat\mu, \mat\Delta_1\otimes\mat\Delta_2)$. Note the order of the covariance matrices $\mat\Delta_1, \mat\Delta_2$. Its density is given by
% \begin{displaymath}
%     f(\mat{X}) = \frac{1}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{1}{2}\tr(\mat\Delta_1^{-1}\t{(\mat X - \mat \mu)}\mat\Delta_2^{-1}(\mat X - \mat \mu))\right).
% \end{displaymath}

% \section{Sampling form a Multi-Array Normal Distribution}
% Let $\ten{X}$ be an order (rank) $r$ Multi-Array random variable of dimensions $p_1\times...\times p_r$ following a Multi-Array (or Tensor) Normal distributed
% \begin{displaymath}
%     \ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r).
% \end{displaymath}
% Its density is given by
% \begin{displaymath}
%     f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{q_i}} \Big)^{-1}
%         \exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right)
% \end{displaymath}
% with $q_i = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution
% \begin{displaymath}
%     \vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1)
% \end{displaymath}
% with $p = \prod_{i = 1}^r p_i$.

% \todo{Check this!!!}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%                           Reference Summaries                            %%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Reference Summaries}
This section contains short summaries of the main references with each sub-section concerning one paper.

\subsection{}

\subsection{Generalized Tensor Decomposition With Features on Multiple Modes}
The \cite{TensorDecomp-HuLeeWang2022} paper proposes a multi-linear conditional mean model for a constraint rank tensor decomposition. Let the responses $\ten{Y}\in\mathbb{R}^{d_1\times ... \times\d_K}$ be an order $K$ tensor. Associated with each mode $k\in[K]$ they assume feature matrices $\mat{X}_k\in\mathbb{R}^{d_k\times p_k}$. Now, they assume that conditional on the feature matrices $\mat{X}_k$ the entries of the tensor $\ten{Y}$ are independent realizations. The rank constraint is specified through $\mat{r} = (r_1, ..., r_K)$, then the model is given by
\begin{displaymath}
    \E(\ten{Y} | \mat{X}_1, ..., \mat{X}_K) = f(\ten{C}\times\{ \mat{X}_1\mat{M}_1, ..., \mat{X}_K\mat{M}_K \}),\qquad \t{\mat{M}_k}\mat{M}_k = \mat{I}_{r_k}\ \forall k\in[K].
\end{displaymath}
The order $K$ tensor $\ten{C}\in\mathbb{R}^{r_1\times...\times r_K}$ is an unknown full-rank core tensor and the matrices $\mat{M}_k\in\mathbb{R}^{p_k\times r_k}$ are unknown factor matrices. The function $f$ is applied element wise and serves as the link function based on the assumed distribution family of the tensor entries. Finally, the operation $\times$ denotes the tensor-by-matrix product using a short hand
\begin{displaymath}
    \ten{C}\times\{ \mat{X}_1\mat{M}_1, ..., \mat{X}_K\mat{M}_K \}
        = \ten{C}\ttm[1]\mat{X}_1\mat{M}_1\ ...\ttm[K]\mat{X}_K\mat{M}_K
\end{displaymath}
with $\ttm[k]$ denoting the $k$-mode tensor matrix product.

The algorithm for estimation of $\ten{C}$ and $\mat{M}_1, ..., \mat{M}_K$ assumes the individual conditional entries of $\ten{Y}$ to be independent and to follow a generalized linear model with link function $f$. The proposed algorithm is an iterative algorithm for minimizing the negative log-likelihood
\begin{displaymath}
    l(\ten{C}, \mat{M}_1, ..., \mat{M}_K) = \langle \ten{Y}, \Theta \rangle - \sum_{i_1, ..., i_K} b(\Theta_{i_1, ..., i_K}), \qquad \Theta = \ten{C}\times\{ \mat{X}_1\mat{M}_1, ..., \mat{X}_K\mat{M}_K \}
\end{displaymath}
where $b = f'$ it the derivative of the canonical link function $f$ in the generalized linear model the conditioned entries of $\ten{Y}$ follow. The algorithm utilizes the higher-order SVD (HOSVD) to enforce the rank-constraint.

The main benefit is that this approach generalizes well to a multitude of different structured data sets.

\todo{ how does this relate to the $\mat{X} = \mat{\mu} + \mat{\beta}\mat{f}_y\t{\mat{\alpha}} + \mat{\epsilon}$ model.}

\end{document}