\documentclass[a4paper, 10pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{fullpage}
\usepackage{amsmath, amssymb, amstext, amsthm, scalerel, bm}
\usepackage{makeidx} % Index (Symbols, Names, ...)
\usepackage{xcolor, graphicx} % colors and including images
\usepackage{tikz}
\usepackage{algorithm, algpseudocode} % Pseudo Codes / Algorithms
\usepackage[pdftex]{hyperref} % Load as last package! Redefines commands
\usetikzlibrary{calc, 3d}
% defines a new TiKz option `canvas is plane={O(#1,#2,#3)x(#4,#5,#6)y(#7,#8,#9)}`
\makeatletter
\tikzoption{canvas is plane}[]{\@setOxy#1}
\def\@setOxy O(#1,#2,#3)x(#4,#5,#6)y(#7,#8,#9){%
\def\tikz@plane@origin{\pgfpointxyz{#1}{#2}{#3}}%
\def\tikz@plane@x{\pgfpointxyz{#4}{#5}{#6}}%
\def\tikz@plane@y{\pgfpointxyz{#7}{#8}{#9}}%
\tikz@canvas@is@plane%
}
\makeatother
\usepackage[
% backend=bibtex,
style=authoryear-comp
]{biblatex}
% Document meta into
\title{Generalized Multi-Linear Modeling for Sufficient Dimension Reduction for the Quadratic Exponential Family}
\author{Daniel Kapla}
\date{\today}
% Set PDF title, author and creator.
\AtBeginDocument{
\hypersetup{
pdftitle = {Generalized Multi-Linear Modeling for Sufficient Dimension Reduction for the Quadratic Exponential Family},
pdfauthor = {Daniel Kapla},
pdfcreator = {\pdftexbanner}
}
}
% Bibliography resource(s)
\addbibresource{main.bib}
% Setup environments
% Theorem, Lemma
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{example}{Example}
% Definition
\theoremstyle{definition}
\newtheorem{defn}{Definition}
% Remark
\theoremstyle{remark}
\newtheorem{remark}{Remark}
% Define math macros
\newcommand*{\mat}[1]{\boldsymbol{#1}}
\newcommand*{\ten}[1]{\mathcal{#1}}
\renewcommand*{\vec}{\operatorname{vec}}
\newcommand*{\unvec}{\operatorname{vec^{-1}}}
\newcommand*{\reshape}[1]{\operatorname{reshape}_{#1}}
\newcommand*{\vech}{\operatorname{vech}}
\newcommand*{\rank}{\operatorname{rank}}
\newcommand*{\diag}{\operatorname{diag}}
\newcommand*{\perm}[1]{\mathfrak{S}_{#1}} % set of permutations of size #1
\newcommand*{\len}[1]{\#{#1}} % length of #1
\DeclareMathOperator*{\ttt}{\circledast}
\DeclareMathOperator{\tr}{tr}
\DeclareMathOperator{\var}{Var}
\DeclareMathOperator{\cov}{Cov}
\DeclareMathOperator{\Span}{span}
\DeclareMathOperator{\E}{\operatorname{\mathbb{E}}}
% \DeclareMathOperator{\independent}{{\bot\!\!\!\bot}}
\DeclareMathOperator*{\argmin}{{arg\,min}}
\DeclareMathOperator*{\argmax}{{arg\,max}}
\newcommand*{\D}{\textnormal{D}} % derivative
\renewcommand*{\H}{\textnormal{H}} % hessian
\renewcommand*{\d}{\textnormal{d}} % differential
\renewcommand*{\t}[1]{{#1^{\prime}}} % matrix transpose
\newcommand*{\pinv}[1]{{#1^{\dagger}}} % `Moore-Penrose pseudoinverse`
\newcommand*{\invlink}{\widetilde{\mat{g}}}
\newcommand{\todo}[1]{{\color{red}TODO: #1}}
\newcommand{\effie}[1]{{\color{blue}Effie: #1}}
%%% Custom operators with ether one or two arguments (limits)
\makeatletter
%%% Multi-Linear Multiplication
% Save first argument as \arg@one
\def\mlm#1{\def\arg@one{#1}\futurelet\next\mlm@i}
% Check for second argument
\def\mlm@i{\ifx\next\bgroup\expandafter\mlm@two\else\expandafter\mlm@one\fi}
% specialization for one or two arguments, both versions use saved first argument
\def\mlm@one{\mathchoice%
{\operatorname*{\scalerel*{\times}{\bigotimes}}_{\makebox[0pt][c]{$\scriptstyle \arg@one$}}}%
{\operatorname*{\scalerel*{\times}{\bigotimes}}_{\arg@one}}%
{\operatorname*{\scalerel*{\times}{\bigotimes}}_{\arg@one}}%
{\operatorname*{\scalerel*{\times}{\bigotimes}}_{\arg@one}}%
}
% this commands single argument is the second argument of \mlm
\def\mlm@two#1{\mathchoice%
{\operatorname*{\scalerel*{\times}{\bigotimes}}_{\makebox[0pt][c]{$\scriptstyle \arg@one$}}^{\makebox[0pt][c]{$\scriptstyle #1$}}}%
{\operatorname*{\scalerel*{\times}{\bigotimes}}_{\arg@one}^{#1}}%
{\operatorname*{\scalerel*{\times}{\bigotimes}}_{\arg@one}^{#1}}%
{\operatorname*{\scalerel*{\times}{\bigotimes}}_{\arg@one}^{#1}}%
}
%%% Big Kronecker Product (with overflowing limits)
% Save first argument as \arg@one
\def\bigkron#1{\def\arg@one{#1}\futurelet\next\bigkron@i}
% Check for second argument
\def\bigkron@i{\ifx\next\bgroup\expandafter\bigkron@two\else\expandafter\bigkron@one\fi}
% specialization for one or two arguments, both versions use saved first argument
\def\bigkron@one{\mathchoice%
{\bigotimes_{\makebox[0pt][c]{$\scriptstyle \arg@one$}}}%
{\bigotimes_{\arg@one}}%
{\bigotimes_{\arg@one}}%
{\bigotimes_{\arg@one}}%
}
% this commands single argument is the second argument of \bigkron
\def\bigkron@two#1{\mathchoice%
{\bigotimes_{\makebox[0pt][c]{$\scriptstyle \arg@one$}}^{\makebox[0pt][c]{$\scriptstyle #1$}}}%
{\bigotimes_{\arg@one}^{#1}}%
{\bigotimes_{\arg@one}^{#1}}%
{\bigotimes_{\arg@one}^{#1}}%
}
\makeatother
% Pseudo Code Commands
\newcommand{\algorithmicbreak}{\textbf{break}}
\newcommand{\Break}{\State \algorithmicbreak}
\begin{document}
\maketitle
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{abstract}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We consider regression and classification for \textit{general} response and tensor-valued predictors (multi-dimensional arrays) and propose a \textit{novel formulation} for sufficient dimension reduction. Assuming the distribution of the tensor-valued predictors given the response is in the quadratic exponential family, we model the natural parameter as a multi-linear function of the response.
This allows per-axis reductions that drastically reduce the total number of parameters for higher order tensor-valued predictors. We derive maximum likelihood estimates for the sufficient dimension reduction and a computationally efficient estimation algorithm which leverages the tensor structure. The performance of the method is illustrated via simulations and real-world examples are provided.
\end{abstract}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Introduction}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Notation}
Vectors are written as boldface lowercase letters (e.g. $\mat a$, $\mat b$), matrices use boldface uppercase or Greek letters (e.g. $\mat A$, $\mat B$, $\mat\alpha$, $\mat\Delta$). The identity matrix of dimensions $p\times p$ is denoted by $\mat{I}_p$ and the commutation matrix by $\mat{K}_{p, q}$, or $\mat{K}_p$ in case of $p = q$. Tensors, meaning multi-dimensional arrays of order at least 3, use uppercase calligraphic letters (e.g. $\ten{A}$, $\ten{B}$, $\ten{X}$, $\ten{Y}$, $\ten{F}$). Boldface indices (e.g. $\mat{i}, \mat{j}, \mat{k}$) denote multi-indices $\mat{i} = (i_1, ..., i_r)\in[\mat{q}]$, where the bracket notation is a shorthand for $[r] = \{1, ..., r\}$ and, in conjunction with a multi-index as argument, means $[\mat{q}] = [q_1]\times ... \times[q_r] = \{ (i_1, ..., i_r)\in\mathbb{N}^r : 1\leq i_k\leq q_k, \forall k = 1, ..., r \}$.
Let $\ten{A} = (a_{i_1,...,i_r})\in\mathbb{R}^{q_1\times ...\times q_r}$ be an order\footnote{Also called rank, hence the variable name $r$, but this term is \emph{not} used as it leads to confusion with the rank as in ``the rank of a matrix''.} $r$ tensor, where $r\in\mathbb{N}$ is the number of modes or axes of $\ten{A}$. For matrices $\mat{B}_k\in\mathbb{R}^{p_k\times q_k}$ with $k\in[r] = \{1, 2, ..., r\}$ the \emph{multi-linear multiplication} is defined element-wise as
\begin{displaymath}
(\ten{A}\times\{\mat{B}_1, ..., \mat{B}_r\})_{j_1, ..., j_r} = \sum_{i_1, ..., i_r = 1}^{q_1, ..., q_r} a_{i_1, ..., i_r}(B_{1})_{j_1, i_1} \cdots (B_{r})_{j_r, i_r}
\end{displaymath}
which results in an order $r$ tensor of dimensions $p_1\times ...\times p_r$. With this, the \emph{$k$-mode product} between the tensor $\ten{A}$ and the matrix $\mat{B}_k$ is given by
\begin{displaymath}
\ten{A}\times_k\mat{B}_k = \ten{A}\times\{\mat{I}_{q_1}, ..., \mat{I}_{q_{k-1}}, \mat{B}_{k}, \mat{I}_{q_{k+1}}, ..., \mat{I}_{q_r}\}.
\end{displaymath}
Furthermore, the notation $\ten{A}\times_{k\in S}\mat{B}_k$ is a shorthand for the iterated application of the mode product for all indices in $S\subseteq[r]$. For example $\ten{A}\times_{k\in\{2, 5\}}\mat{B}_k = \ten{A}\times_2\mat{B}_2\times_5\mat{B}_5$. By only allowing $S$ to be a set, this notation is unambiguous, because mode products commute across different modes: $j\neq k\Rightarrow\ten{A}\times_j\mat{B}_j\times_k\mat{B}_k = \ten{A}\times_k\mat{B}_k\times_j\mat{B}_j$.
The \emph{inner product} between two tensors of the same order and dimensions is
\begin{displaymath}
\langle\ten{A}, \ten{B}\rangle = \sum_{i_1, ..., i_r} a_{i_1, ..., i_r}b_{i_1, ..., i_r}
\end{displaymath}
with which the \emph{Frobenius norm} is $\|\ten{A}\|_F = \sqrt{\langle\ten{A}, \ten{A}\rangle}$. Of interest is also the \emph{maximum norm} $\|\ten{A}\|_{\infty} = \max_{i_1, ..., i_r} |a_{i_1, ..., i_r}|$. The Frobenius and maximum norms are also used for matrices, while for a vector $\mat{a}$ the \emph{2-norm} is $\|\mat{a}\|_2 = \sqrt{\langle\mat{a}, \mat{a}\rangle}$.
Matrices and tensors can be \emph{vectorized} by the \emph{vectorization} operator $\vec$. Sometimes only the order of the elements in an object is of interest; therefore we use the notation $\ten{A}\equiv \ten{B}$ for objects $\ten{A}, \ten{B}$ of any shape if and only if $\vec{\ten{A}} = \vec{\ten{B}}$. For tensors of order at least $2$ the \emph{flattening} (or \emph{unfolding} or \emph{matricization}) is a reshaping of the tensor into a matrix along a particular mode. For a tensor $\ten{A}$ of order $r$ and dimensions $q_1\times ...\times q_r$ the $k$-mode unfolding $\ten{A}_{(k)}$ is a $q_k\times \prod_{l=1, l\neq k}q_l$ matrix. For the tensor $\ten{A} = (a_{i_1,...,i_r})\in\mathbb{R}^{q_1\times ...\times q_r}$ the elements of the $k$-mode unfolding $\ten{A}_{(k)}$ are
\begin{displaymath}
(\ten{A}_{(k)})_{i_k, j} = a_{i_1, ..., i_r}\quad\text{ with }\quad j = 1 + \sum_{\substack{l = 1\\l \neq k}}^r (i_l - 1) \prod_{\substack{m = 1\\m\neq k}}^{l - 1}q_m.
\end{displaymath}
The rank of a tensor $\ten{A}$ of dimensions $q_1\times ...\times q_r$ is given by the vector $\rank{\ten{A}} = (\rank(\ten{A}_{(1)}), ..., \rank(\ten{A}_{(r)}))\in[q_1]\times...\times[q_r]$, where $\rank(\ten{A}_{(k)})$ is the usual matrix rank of the $k$-mode unfolding.
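The following NumPy sketch is an illustration only (the helper names \texttt{mode\_prod}, \texttt{unfold} and \texttt{vec} are ours, and modes are indexed from zero); it realizes the $k$-mode product, the $k$-mode unfolding and the vectorization with the column-major index ordering used above.
\begin{verbatim}
import numpy as np

def mode_prod(A, B, k):
    # k-mode product A x_k B: contracts mode k of A with the columns of B
    return np.moveaxis(np.tensordot(B, A, axes=(1, k)), 0, k)

def unfold(A, k):
    # k-mode unfolding A_(k) of shape (q_k, prod of the remaining dimensions)
    return np.moveaxis(A, k, 0).reshape(A.shape[k], -1, order='F')

def vec(A):
    # column-major vectorization
    return A.reshape(-1, order='F')

# sanity check of the identity (A x_k B)_(k) = B A_(k)
A, B = np.random.randn(2, 3, 5), np.random.randn(4, 3)
assert np.allclose(unfold(mode_prod(A, B, 1), 1), B @ unfold(A, 1))
\end{verbatim}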
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{The Model and Sufficient Reductions}\label{sec:GMLM}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The main relation for sufficient dimension reduction in an inverse regression setting is the following. Let $(\ten{X}, Y)$ be jointly distributed and let $\mat{R}$ be a measurable function of $\ten{X}$, that is, a statistic. Then
\begin{align*}
Y\mid\ten{X} &\sim Y\mid \mat{R}(\ten{X})
& &\Leftrightarrow &
\ten{X}\mid(Y, \mat{R}(\ten{X})) &\sim \ten{X}\mid\mat{R}(\ten{X}).
\end{align*}
This means that a sufficient statistic $\mat{R}(\ten{X})$ for $Y$ in the inverse regression $\ten{X}\mid Y$ yields a sufficient reduction for $\ten{X}$ in the forward regression $Y\mid\ten{X}$.
Let $(\ten{X}, Y)$ be jointly distributed, where $\ten{X}$ is a random order $r$ tensor of dimensions $p_1\times ...\times p_r$ and $Y$ is arbitrary, as we only require a known tensor-valued function of $Y$, denoted $\ten{F}_Y$, of order $r$ and dimensions $q_1\times ... \times q_r$. Furthermore, we assume $\E\ten{F}_Y = \mat{0}$ from now on and denote $p = \prod_{k = 1}^r p_k$ and $q = \prod_{k = 1}^r q_k$ as well as $\mat{p} = (p_1, ..., p_r)$ and $\mat{q} = (q_1, ..., q_r)$.
We assume that $\ten{X}\mid Y$ follows a quadratic exponential family distribution. The restriction to a \emph{quadratic} exponential family is to properly model the tensor structure of $\ten{X}$. The pdf (pmf) with parameters $\mat{\theta}_y$ and natural parameters $\mat{\eta}_y = \mat{\eta}(\mat{\theta}_y)$ is given by
\begin{align}
f_{\mat{\theta}_y}(\ten{X}\mid Y = y)
&= h(\ten{X})\exp(\t{\mat{\eta}(\mat{\theta}_y)}\mat{t}(\ten{X}) - b(\mat{\theta}_y)) \nonumber \\
&= h(\ten{X})\exp(\t{\mat{\eta}_{y,1}}\vec(\ten{X}) + \t{\mat{\eta}_{y,2}}(\vec(\ten{X})\otimes\vec(\ten{X})) - b(\mat{\theta}_y)) \nonumber \\
&= h(\ten{X})\exp(\t{\mat{\eta}_{y,1}}\mat{t}_1(\ten{X}) + \t{\mat{\eta}_{y,2}}\mat{t}_2(\ten{X}) - b(\mat{\theta}_y)).\label{eq:exp-family}
\end{align}
Under \eqref{eq:exp-family} the first natural parameter vector $\mat{\eta}_{y,1}$ is a $p$ dimensional vector while the second natural parameter vector $\mat{\eta}_{y,2}$ consists of $p^2$ elements. This is not the most compact form, as it is possible to use only $p(p + 1) / 2$ values with $\vech(\vec(\ten{X})\t{\vec(\ten{X})})$ as the statistic for the second natural parameter. The reason for choosing $\mat{t}_2(\ten{X}) = \vec(\ten{X})\otimes\vec(\ten{X})$ instead is technical, as it simplifies modeling the tensor structure of $\ten{X}$.
In analogy to a GLM we model the natural parameters as a multi-linear function of $\ten{F}_y$, which is a tensor of dimensions $q_1\times ... \times q_r$ of known functions of $y$ such that $\E_Y\ten{F}_Y = 0$.
\begin{equation}\label{eq:eta_y1}
\mat{\eta}_1(\mat{\theta}_y) = \mat{\eta}_{y,1} \equiv c_1 (\overline{\ten{\eta}}_1 + \ten{F}_y\times_{k\in[r]}\mat{\alpha}_k)
\end{equation}
where $\overline{\ten{\eta}}_1$ is an intercept parameter of the same shape as $\ten{X}$ while the linear predictors $\mat{\alpha}_k$, for $k = 1, ..., r$, are unconstrained $p_k\times q_k$ dimensional matrices. The scalar $c_1$ is known and eases modeling for specific distributions. The second natural parameter vector $\mat{\eta}_{y,2}$ is assumed to have the form
\begin{equation}\label{eq:eta_y2}
\mat{\eta}_2(\mat{\theta}_y) = \mat{\eta}_{y,2} \equiv c_2 \bigotimes_{k = r}^1 \mat{\Omega}_k
\end{equation}
with $\mat{\Omega}_k$ being $p_k\times p_k$ symmetric matrices for $k = 1, ..., r$.
In a classical GLM we also have an invertible link function $\mat{g}$ connecting the natural parameters to the expectation of the exponential family statistics as $\mat{\eta}_y = \mat{g}(\E[\mat{t}(\ten{X}) \mid Y = y])$. Such a link may not exist, but for our purposes it suffices to have the ``inverse'' link. The ``inverse'' link $\invlink$ does exist, as the natural parameters fully describe the distribution,
\begin{displaymath}
\invlink(\mat{\eta}_y) = \E[\mat{t}(\ten{X}) \mid Y = y]
\end{displaymath}
which we also split into two parts as $\invlink_1(\mat{\eta}_y) = \E[\mat{t}_1(\ten{X}) \mid Y = y] \equiv \E[\ten{X} \mid Y = y]$ and $\invlink_2(\mat{\eta}_y) = \E[\mat{t}_2(\ten{X}) \mid Y = y] = \E[\vec(\ten{X})\otimes\vec(\ten{X}) \mid Y = y]$.
We refer to this GLM-like model as the \emph{generalized multi-linear model} (GMLM).
\begin{theorem}[SDR]\label{thm:sdr}
A sufficient reduction for the regression $Y\mid \ten{X}$ under the quadratic exponential family inverse regression model \eqref{eq:exp-family} is given by
\begin{align*}
\mat{R}(\ten{X})
&= \t{\mat{\beta}}(\vec{\ten{X}} - \E\vec{\ten{X}}) \\
&\equiv \ten{X}\times_{k\in[r]}\t{\mat{\alpha}_k}
\end{align*}
for a $p\times q$ dimensional matrix $\mat{\beta}=\bigotimes_{k = r}^{1}\mat{\alpha}_k$ which satisfies $\Span(\mat{\beta}) = \Span(\{\mat{\eta}_{Y,1} - \E_{Y}\mat{\eta}_{Y,1} : Y\in\mathcal{S}_Y\})$.
\end{theorem}
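As a quick numerical check of the stated equivalence (a sketch only, reusing the helpers \texttt{mode\_prod} and \texttt{vec} from the notation section; centering by $\E\vec{\ten{X}}$ is omitted as it does not affect the identity), the per-mode reduction agrees with the Kronecker-structured linear reduction:
\begin{verbatim}
import numpy as np
from functools import reduce

p, q = (2, 3, 5), (1, 2, 3)
alphas = [np.random.randn(pk, qk) for pk, qk in zip(p, q)]
X = np.random.randn(*p)

R = X                                     # reduction via mode products
for k, a in enumerate(alphas):
    R = mode_prod(R, a.T, k)

beta = reduce(np.kron, reversed(alphas))  # beta = alpha_r (x) ... (x) alpha_1
assert np.allclose(vec(R), beta.T @ vec(X))
\end{verbatim}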
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Estimation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
For parameter estimation of the GMLM parameters from Section~\ref{sec:GMLM} we use maximum likelihood estimation.
\begin{theorem}[Log-Likelihood and Score]\label{thm:grad}
For $n$ i.i.d. observations $(\ten{X}_i, y_i), i = 1, ..., n$ the log-likelihood has the form
\begin{equation}\label{eq:likelihood}
l(\mat{\Theta}_y) = \sum_{i = 1}^n(\log h(\ten{X}_i) + c_1\langle\overline{\ten{\eta}}_1 + \ten{F}_{y_i}\times_{k\in[r]}\mat{\alpha}_k, \ten{X}_i \rangle + c_2\langle\ten{X}_i\times_{k\in[r]}\mat{\Omega}_k, \ten{X}_i \rangle - b(\mat{\eta}_{y_i})).
\end{equation}
with $\mat{\Theta}_y$ being the collection of all GMLM parameters $\overline{\ten{\eta}}_1$ and $\mat{\alpha}_k, \mat{\Omega}_k$ for $k = 1, ..., r$.
The partial gradients with respect to the GMLM parameters are given by
\begin{align*}
% \nabla_{\overline{\ten{\eta}}_1}l &= c_1\sum_{i = 1}^n \reshape{\mat{p}}(\mat{t}_1(\ten{X}_i) - \invlink_1(\mat{\eta}_{y_i})), \\
\nabla_{\overline{\ten{\eta}}_1}l &\equiv c_1\sum_{i = 1}^n (\mat{t}_1(\ten{X}_i) - \invlink_1(\mat{\eta}_{y_i})), \\
\nabla_{\mat{\alpha}_j}l &= c_1 \sum_{i = 1}^n \reshape{\mat{p}}(\mat{t}_1(\ten{X}_i) - \invlink_1(\mat{\eta}_{y_i}))_{(j)}\t{(\ten{F}_{y_i}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}, \\
\nabla_{\mat{\Omega}_j}l &\equiv c_2 \mat{D}_{p_j}\t{\mat{D}_{p_j}} \reshape{(\mat{p}, \mat{p})}\!\!\Big(\sum_{i = 1}^n(\mat{t}_2(\ten{X}_i) - \invlink_2(\mat{\eta}_{y_i}))\Big)_{(j, r + j)}\vec\bigotimes_{\substack{k = r\\k\neq j}}^{1}\mat{\Omega}_k
\end{align*}
\end{theorem}
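To make the structure of the score concrete, the following sketch evaluates $\nabla_{\mat{\alpha}_j}l$ as stated above, assuming the tensor-shaped residuals $\mat{t}_1(\ten{X}_i) - \invlink_1(\mat{\eta}_{y_i})$ have already been computed (they are distribution specific); \texttt{mode\_prod} and \texttt{unfold} are the helpers from the notation section and all names are ours.
\begin{verbatim}
import numpy as np

def grad_alpha(j, X, Fy, alphas, resid, c1=1.0):
    # Score with respect to alpha_j; resid[i] is the tensor-shaped
    # residual t_1(X_i) - E[X_i | Y = y_i] (the inverse link values).
    grad = np.zeros_like(alphas[j])
    for i in range(len(X)):
        G = Fy[i]
        for k in range(len(alphas)):
            if k != j:
                G = mode_prod(G, alphas[k], k)   # F_{y_i} x_k alpha_k for k != j
        grad += unfold(resid[i], j) @ unfold(G, j).T
    return c1 * grad
\end{verbatim}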
A straightforward approach to parameter estimation is to maximize the log-likelihood by gradient ascent. For a purely algorithmic speedup, changing only the update rule but \emph{not} the gradient $\nabla_{\mat{\Theta}}l$ of the objective function, we use Nesterov accelerated gradient with an internal line search to determine the step size in each iteration, see Algorithm~\ref{alg:NAGD}.
% Denote with $l(\mat{\Theta}\mid \ten{X}, \ten{F}_y)$ the negative log-likelihood \eqref{eq:likelihood} which is minimization objective from the view of the Algorithm~\ref{alg:NAGD}.
\begin{algorithm}[ht]
\caption{\label{alg:NAGD}Nesterov Accelerated Gradient Descent}
\begin{algorithmic}[1]
\State Objective: $l(\Theta \mid \ten{X}, \ten{F}_y)$
\State Arguments: Order $r + 1$ tensors $\ten{X}$, $\ten{F}$
\State Initialize: Parameters $\Theta^{(0)}$, $0 < c, \delta^{(1)}$ and $0 < \gamma < 1$
\\
\State $t \leftarrow 1$
\Comment{step counter}
\State $\mat{\Theta}^{(1)} \leftarrow \mat{\Theta}^{(0)}$
\Comment{artificial first step}
\State $(m^{(0)}, m^{(1)}) \leftarrow (0, 1)$
\Comment{momentum extrapolation weights}
\\
\Repeat \Comment{repeat until convergence}
\State $\mat{M} \leftarrow \mat{\Theta}^{(t)} + \frac{m^{(t - 1)} - 1}{m^{(t)}}(\mat{\Theta}^{(t)} - \mat{\Theta}^{(t - 1)})$ \Comment{momentum extrapolation}
\For{$\delta = \gamma^{-1}\delta^{(t)}, \delta^{(t)}, \gamma\delta^{(t)}, \gamma^2\delta^{(t)}, ...$} \Comment{Line Search}
\State $\mat{\Theta}_{\text{temp}} \leftarrow \mat{M} + \delta \nabla_{\mat{\Theta}} l(\mat{M})$
\If{$l(\mat{\Theta}_{\text{temp}}) \geq l(\mat{M}) + c \delta \|\nabla_{\mat{\Theta}} l(\mat{M})\|_F^2$} \Comment{Armijo condition (sufficient increase)}
\State $\mat{\Theta}^{(t + 1)} \leftarrow \mat{\Theta}_{\text{temp}}$
\State $\delta^{(t + 1)} \leftarrow \delta$
\Break
\EndIf
\EndFor
\State $m^{(t + 1)} \leftarrow \frac{1 + \sqrt{1 + (2 m^{(t)})^2}}{2}$ \Comment{update extrapolation weights}
\State $t \leftarrow t + 1$
\Until converged
\end{algorithmic}
\end{algorithm}
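A generic sketch of Algorithm~\ref{alg:NAGD}, with the parameters flattened into a single array, could look as follows; \texttt{l} and \texttt{grad} are callables for the log-likelihood and its gradient, and the constants are illustrative defaults rather than recommended values.
\begin{verbatim}
import numpy as np

def nagd(l, grad, theta0, delta=1.0, c=1e-4, gamma=0.5, max_iter=500, tol=1e-8):
    # Nesterov accelerated gradient ascent on the log-likelihood l with a
    # backtracking line search enforcing a sufficient increase (Armijo) condition.
    theta_prev, theta = theta0.copy(), theta0.copy()
    m_prev, m = 0.0, 1.0
    for _ in range(max_iter):
        M = theta + (m_prev - 1.0) / m * (theta - theta_prev)  # extrapolation
        g, l_M = grad(M), l(M)
        for d in delta / gamma * gamma ** np.arange(20):       # delta/gamma, delta, ...
            theta_new = M + d * g
            if l(theta_new) >= l_M + c * d * np.sum(g * g):    # sufficient increase
                delta = d
                break
        if np.linalg.norm(theta_new - theta) < tol:            # converged
            return theta_new
        theta_prev, theta = theta, theta_new
        m_prev, m = m, (1.0 + np.sqrt(1.0 + 4.0 * m * m)) / 2.0
    return theta
\end{verbatim}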
Algorithm~\ref{alg:NAGD} requires initial parameter estimates. We choose a simple heuristic approach by starting with $\mat{\Omega}_k^{(0)} = \mat{I}_{p_k}$ and $\overline{\ten{\eta}}_1^{(0)}$ dependent on the particular distribution. For example, for a tensor normal distribution the sample mean is a good choice, while for the Ising model $0$ has nice properties. The interesting part is the initialization of the multi-linear predictors $\mat{\alpha}_k$. To this end, let $\mat{\Sigma}_k = \frac{q_k}{n q}\sum_{i = 1}^n {\ten{F}_{y_i}}_{(k)}\t{{\ten{F}_{y_i}}_{(k)}}$ and $\mat{\Delta}_k = \frac{p_k}{n p}\sum_{i = 1}^n {\ten{X}_{i}}_{(k)}\t{{\ten{X}_{i}}_{(k)}}$ be mode-wise sample covariance estimates. For both we take the rank $s_k = \min(p_k, q_k)$ truncated SVD approximations $\mat{\Delta}_k \approx \mat{U}_k\mat{D}_k\t{\mat{U}_k}$ and $\mat{\Sigma}_k \approx \mat{V}_k\mat{S}_k\t{\mat{V}_k}$. The truncated SVD gives us the first $s_k$ singular vectors with dimensions $\mat{U}_k(p_k\times s_k), \mat{D}_k(s_k\times s_k)$ and $\mat{V}_k(q_k\times s_k), \mat{S}_k(s_k\times s_k)$. Then we initialize
%%%%%%%%%%%%%%%%%%%%%%%%% TODO: !!!! \todo{!!!!!}
\begin{displaymath}
\mat{\alpha}_k^{(0)} = \mat{U}_k\sqrt{\mat{D}_k}\sqrt{\mat{S}_k}\t{\mat{V}_k}.
\end{displaymath}
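The initialization described above can be sketched as follows, reusing \texttt{unfold} from the notation section and assuming the observations are stacked along a leading sample axis (our convention for illustration only).
\begin{verbatim}
import numpy as np

def init_alphas(X, Fy):
    # X: array of shape (n, p_1, ..., p_r), Fy: array of shape (n, q_1, ..., q_r)
    n, p, q = X.shape[0], X.shape[1:], Fy.shape[1:]
    alphas = []
    for k in range(len(p)):
        # mode-wise sample covariance estimates Delta_k and Sigma_k
        Delta = sum(unfold(X[i], k) @ unfold(X[i], k).T for i in range(n))
        Delta *= p[k] / (n * np.prod(p))
        Sigma = sum(unfold(Fy[i], k) @ unfold(Fy[i], k).T for i in range(n))
        Sigma *= q[k] / (n * np.prod(q))
        s = min(p[k], q[k])
        U, D, _ = np.linalg.svd(Delta)   # Delta_k ~ U D U' (symmetric)
        V, S, _ = np.linalg.svd(Sigma)   # Sigma_k ~ V S V' (symmetric)
        # alpha_k^(0) = U sqrt(D) sqrt(S) V'
        alphas.append((U[:, :s] * np.sqrt(D[:s])) @ (np.sqrt(S[:s])[:, None] * V[:, :s].T))
    return alphas
\end{verbatim}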
% An alternative is to use Iterative Re-weighted least Squares, but his requires the Fisher Information which has dimensions $s\times s$ for $s = p + \sum_{k = 1}^r (p_k q_k + p_k^2)$. This means that for the computation of the Fisher Information we have a computational complexity of $\mathcal{O}()$
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Examples}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We illustrate the SDR method in two special cases, first for the tensor normal distribution and second for the multivariate Bernoulli distribution, with vector-, matrix- and tensor-valued predictors.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Tensor Normal}
Let $\ten{X}, \ten{F}_y$ be order $r$ tensors of dimensions $p_1\times ... \times p_r$ and $q_1\times ... \times q_r$, respectively. We assume the inverse regression model for $\ten{X}\mid Y = y$ to be tensor normal distributed with density
\begin{displaymath}
f_{\mat{\theta}_y}(\ten{X}\mid Y = y) = (2\pi)^{-p/2}\prod_{k = 1}^r |\mat{\Delta}_{k}|^{-p / (2 p_{k})}\exp\Big(
-\frac{1}{2}\langle \ten{X} - \ten{\mu}_y, (\ten{X} - \ten{\mu}_y)\times_{k\in[r]}\mat{\Delta}_{k}^{-1} \rangle
\Big)
\end{displaymath}
with location parameter tensor $\ten{\mu}_y$ depending on $y$ and the symmetric covariance matrices $\mat{\Delta}_{k}$ for each of the $k\in[r]$ modes (independent of $y$) collected in the parameter vector $\mat{\theta}_y = (\ten{\mu}_y, \mat{\Delta}_1, ..., \mat{\Delta}_r)$. Rewriting into the form of a quadratic exponential family leads to
\begin{align*}
f_{\mat{\theta}_y}(\ten{X}\mid Y = y)
&= (2\pi)^{-p/2} \exp\Big(
-\frac{1}{2}\langle \ten{X}, \ten{X}\times_{k\in[r]}\mat{\Delta}_{k}^{-1} \rangle
+\langle \ten{X}, \ten{\mu}_y\times_{k\in[r]}\mat{\Delta}_k^{-1} \rangle \\
&\makebox[10em]{}-\frac{1}{2}\langle \ten{\mu}_y, \ten{\mu}_y\times_{k\in[r]}\mat{\Delta}_{k}^{-1} \rangle
-\sum_{k = 1}^r \frac{p}{2 p_{k}}\log|\mat{\Delta}_k|
\Big) \\
&= h(\ten{X})\exp(\t{\mat{{\eta}}(\mat{\theta}_y)}\mat{t}(\ten{X}) - b(\mat{\theta}_y)).
\end{align*}
Identifying the exponential family components gives
\begin{align*}
h(\ten{X}) &= (2\pi)^{-p/2} \\
b(\mat{\theta}_y) &= \frac{1}{2}\langle \ten{\mu}_y, \ten{\mu}_y\times_{k\in[r]}\mat{\Delta}_{k}^{-1} \rangle + \sum_{k = 1}^r \frac{p}{2 p_{k}}\log|\mat{\Delta}_{k}|
\end{align*}
and
\begin{align*}
\mat{\eta}(\mat{\theta}_y) &= (\mat{\eta}_1(\mat{\theta}_y); \mat{\eta}_2(\mat{\theta}_y)) &
\mat{t}(\ten{X}) &= (\mat{t}_1(\ten{X}); \mat{t}_2(\ten{X}))
\end{align*}
where
\begin{align*}
\mat{\eta}_1(\mat{\theta}_y) = \mat{\eta}_{y,1} &= \vec(\ten{\mu}_y\times_{k\in[r]}\mat{\Delta}_{k}^{-1}), &
\mat{t}_1(\ten{X}) &= \vec\ten{X}, \\
\mat{\eta}_2(\mat{\theta}_y) = \mat{\eta}_{y,2} &= -\frac{1}{2}\vec\bigotimes_{k = r}^{1}\mat{\Delta}_{k}^{-1}, &
\mat{t}_2(\ten{X}) &= \vec\ten{X}\otimes\vec\ten{X}.
\end{align*}
The natural parameters are modeled as described for the GMLM in \eqref{eq:eta_y1} and \eqref{eq:eta_y2}, here with $c_1 = 1$ and $c_2 = -\frac{1}{2}$, as
\begin{align*}
\mat{\eta}_{y,1} &= \vec(\overline{\ten{\eta}}_1 + \ten{F}_y\times_{k\in[r]}\mat{\alpha}_{k}) \\
\mat{\eta}_{y,2} &= -\frac{1}{2}\vec\bigotimes_{k = r}^{1}\mat{\Omega}_{k}.
\end{align*}
The intercept parameter $\overline{\ten{\eta}}_1$ is of the same dimensions as $\ten{X}$ and the reduction matrices $\mat{\alpha}_j$ are of dimensions $p_j\times q_j$ while the symmetric $\mat{\Omega}_j$ are of dimensions $p_j\times p_j$. The inverse relation from the GLM parameters to the tensor normal parameters is
\begin{align*}
\ten{\mu}_y &= (\overline{\ten{\eta}}_1 + \ten{F}_y\times_{j\in[r]}\mat{\alpha}_{j})\times_{k\in[r]}\mat{\Omega}_{k}^{-1} \equiv (\unvec(-2\mat{\eta}_{y,2}))^{-1}\mat{\eta}_{y,1} \\
\mat{\Delta}_{k} &= \mat{\Omega}_{k}^{-1}
\end{align*}
for each $k\in[r]$. The inverse link is given by
\begin{displaymath}
\invlink(\mat{\eta}_y) = \E_{\mat{\theta}_y}[\mat{t}(\ten{X})\mid Y = y]
\end{displaymath}
consisting of the first and second (uncentered) vectorized moments of the tensor normal distribution.
\begin{align*}
\D b(\mat{\eta}_{y,1}) \equiv \invlink_1(\mat{\eta}_y) &\equiv \E[\ten{X} \mid Y = y] = \ten{\mu}_y \\
&= (\overline{\ten{\eta}}_1 + \ten{F}_y\times_{k\in[r]}\mat{\alpha}_k) \times_{k\in[r]}\mat{\Omega}_k^{-1} \\
\D b(\mat{\eta}_{y,2}) \equiv \invlink_2(\mat{\eta}_y) &\equiv \E[\vec(\ten{X})\t{\vec(\ten{X})} \mid Y = y] \\
&= \cov(\vec{\ten{X}} \mid Y = y) + \vec(\ten{\mu}_y)\t{\vec(\ten{\mu}_y)} \\
&= \bigotimes_{k = r}^{1}\mat{\Omega}_k^{-1} + \vec(\ten{\mu}_y)\t{\vec(\ten{\mu}_y)}
\end{align*}
Furthermore, the explicit parameterization of the second moments
\begin{displaymath}
\cov(\mat{t}(\ten{X})\mid Y = y) = \H b(\mat{\eta}_y)
= \begin{pmatrix}\mat{H}_{1,1} & \mat{H}_{1,2} \\ \mat{H}_{2,1} & \mat{H}_{2,2}\end{pmatrix}
% = \begin{pmatrix}\mat{H}_{1,1} & \mat{H}_{1,2} \\ \mat{H}_{2,1} & \mat{H}_{2,2}\end{pmatrix} = \cov(\mat{t}(\ten{X})\mid Y = y) = \begin{pmatrix}
% \cov(\mat{t}_1(\ten{X})\mid Y = y) & \cov(\mat{t}_1(\ten{X}), \mat{t}_2(\ten{X})\mid Y = y) \\
% \cov(\mat{t}_2(\ten{X}), \mat{t}_1(\ten{X})\mid Y = y) & \cov(\mat{t}_2(\ten{X})\mid Y = y)
% \end{pmatrix}
\end{displaymath}
are given by
\begin{align*}
\mat{H}_{1,1} &= \cov(\vec{\ten{X}}\mid Y = y) = \bigotimes_{k = r}^1 \mat{\Omega}_k^{-1} = \bigotimes_{k = r}^1 \mat{\Delta}_k \\
\mat{H}_{2,1} = \t{\mat{H}_{1,2}} &= \cov(\vec{\ten{X}}\otimes \vec{\ten{X}}, \vec{\ten{X}}\mid Y = y) \\
&= \vec(\ten{\mu}_y)\otimes\Big(\bigotimes_{k = r}^1 \mat{\Delta}_k\Big) + \Big(\bigotimes_{k = r}^1 \mat{\Delta}_k\Big)\otimes\vec(\ten{\mu}_y)
\end{align*}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Ising Model}
The conditional Ising model for the inverse regression $\ten{X}\mid Y = y$ with $p (p + 1) / 2$ parameters $\mat{\theta}_y$ is given by
\begin{align*}
P_{\mat{\theta}_y}(\ten{X}\mid Y = y)
&= p_0(\mat{\theta}_y)\exp(\t{\vech(\vec(\ten{X})\t{\vec(\ten{X})})}\mat{\theta}_y) \\
&= h(\ten{X})\exp(\t{\mat{{\eta}}(\mat{\theta}_y)}\mat{t}(\ten{X}) - b(\mat{\theta}_y))
\end{align*}
where $h(\ten{X}) = 1$ and $b(\mat{\theta}_y) = -\log p_0(\mat{\theta}(\mat{\eta}_y))$.
Following the GMLM model, we model the natural parameters as
\begin{align*}
\mat{\eta}_{y,1} &\equiv c_1 (\overline{\ten{\eta}}_1 + \ten{F}_y\times_{k\in[r]}\mat{\alpha}_k), &
\mat{\eta}_{y,2} &\equiv c_2 \bigotimes_{k = r}^{1}\mat{\Omega}_k.
\end{align*}
where we set the constants $c_1 = c_2 = 1$. This yields the following relation to the conditional Ising model parameters
\begin{displaymath}
\mat{\theta}_y = \mat{\theta}(\mat{\eta}_y) = \vech(\diag(\mat{\eta}_{y,1}) + (1_{p\times p} - \mat{I}_p) \odot \reshape{(p, p)}(\mat{\eta}_{y,2}))
\end{displaymath}
Note that the diagonal elements of the $\mat{\Omega}_k$ are multiplied by $0$, which means they are ignored. This reflects the fact that under the Ising model (and in general for the multivariate Bernoulli) $\E[\ten{X}_{\mat{j}}\mid Y] = \E[\ten{X}_{\mat{j}}\ten{X}_{\mat{j}}\mid Y]$ holds, because the entries of $\ten{X}$ are either $0$ or $1$. Therefore, our model is over-parameterized, as the diagonal elements of the $\mat{\Omega}_k$ and $\overline{\ten{\eta}}_1$ serve the same purpose.
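For illustration, the mapping from the GMLM parameters to the conditional Ising model parameter vector $\mat{\theta}_y$ can be sketched as follows (a hedged sketch; the half-vectorization is taken row-wise over the upper triangle, which coincides with $\vech$ for symmetric matrices, and all names are ours).
\begin{verbatim}
import numpy as np
from functools import reduce

def ising_theta(eta1, Omegas):
    # eta1: the tensor-shaped first natural parameter (shape p_1 x ... x p_r),
    # Omegas: list of the symmetric p_k x p_k matrices Omega_k.
    eta2 = reduce(np.kron, reversed(Omegas))   # reshape_(p,p) of eta_{y,2}
    p = eta2.shape[0]
    Theta = np.diag(eta1.reshape(-1, order='F')) + (1.0 - np.eye(p)) * eta2
    return Theta[np.triu_indices(p)]           # vech (Theta is symmetric)
\end{verbatim}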
\section{Simulations}
\subsection{Simulation 1: Tensor Normal}
$\ten{X}$ is a $2\times 3\times 5$ tensor, $y\in\{1, 2, ..., 6\}$ is uniformly distributed and $\ten{F}_y$ is a $1\times 2\times 3$ tensor with $0$ or $1$ entries such that $(\vec{\ten{F}}_y)_j = \delta_{y,j}$. The GMLM parameters are simply $\overline{\ten{\eta}}_1 = 0$, the matrices $\mat{\alpha}_k$ are filled with independent standard normal entries and the $\mat{\Omega}_k$ are $AR_{p_k}(0.5)$. Then $\ten{X} \mid Y = y$ was drawn from a tensor normal distribution with mean $\ten{\mu}_y = \ten{F}_y\times_{k\in[3]}\mat{\Omega}_k^{-1}\mat{\alpha}_k$ and covariances $\mat{\Omega}_k^{-1}$.
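A sketch of this data generating process is given below; it reuses \texttt{mode\_prod} from the notation section, interprets $AR_{p_k}(0.5)$ as $(\mat{\Omega}_k)_{ij} = 0.5^{|i-j|}$, and the sample size and seed are arbitrary choices for illustration.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(42)
p, q, n = (2, 3, 5), (1, 2, 3), 500
alphas = [rng.standard_normal((pk, qk)) for pk, qk in zip(p, q)]
Omegas = [0.5 ** np.abs(np.subtract.outer(np.arange(pk), np.arange(pk))) for pk in p]
Deltas = [np.linalg.inv(Om) for Om in Omegas]       # covariances Delta_k = Omega_k^{-1}
chols  = [np.linalg.cholesky(De) for De in Deltas]

y = rng.integers(1, 7, size=n)                      # y uniform on {1, ..., 6}
X = np.empty((n,) + p)
for i in range(n):
    Fy = np.zeros(np.prod(q)); Fy[y[i] - 1] = 1.0   # (vec F_y)_j = delta_{y, j}
    Fy = Fy.reshape(q, order='F')
    mu, Z = Fy, rng.standard_normal(p)
    for k in range(3):
        mu = mode_prod(mu, Deltas[k] @ alphas[k], k) # mu_y = F_y x_k Omega_k^{-1} alpha_k
        Z  = mode_prod(Z, chols[k], k)               # Cov(vec X) = Delta_r (x) ... (x) Delta_1
    X[i] = mu + Z
\end{verbatim}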
\begin{figure}[!ht]
\centering
\includegraphics[width = \textwidth]{sim-normal-20221012.png}
\caption{\label{fig:sim-normal}Simulation results for the tensor normal model.}
\end{figure}
\subsection{Simulation 2: Small Ising}
\begin{figure}[!ht]
\centering
\includegraphics[width = \textwidth]{sim-ising-small-20221012.png}
\caption{\label{fig:sim-ising-small}Simulation results for the small Ising model.}
\end{figure}
\printbibliography[heading=bibintoc,title={References}]
\appendix
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Fisher Information}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We start by computing the \emph{observed information matrix} $\mathcal{J}(\mat{\Theta})$ at the parameters $\mat{\Theta} = (\overline{\ten{\eta}}_1$, $\mat{\alpha}_1$, $...$, $\mat{\alpha}_r$, $\mat{\Omega}_1$, $...$, $\mat{\Omega}_r)$, given as a block matrix consisting of $(2 r + 1)^2$ blocks (although only $(2 r + 1)(r + 1)$ blocks are unique due to the block symmetry $\mathcal{J}_{i, j} = \t{\mathcal{J}_{j, i}}$)
\begin{displaymath}
\mathcal{J}(\mat{\Theta}) = \begin{pmatrix}
\mathcal{J}_{1,1} & \mathcal{J}_{1,2} & \cdots & \mathcal{J}_{1, 2 r + 1} \\
\vdots & \vdots & \ddots & \vdots \\
\mathcal{J}_{2 r + 1, 1} & \mathcal{J}_{2 r + 1, 2} & \cdots & \mathcal{J}_{2 r + 1, 2 r + 1}
\end{pmatrix}
\end{displaymath}
where each individual block is given by
\begin{displaymath}
\mathcal{J}_{j,l} = -\frac{\partial^2 l(\mat{\Theta})}{\partial(\vec{\mat{\Theta}_j})\,\t{\partial(\vec{\mat{\Theta}_l})}}.
\end{displaymath}
For example $\mathcal{J}_{1,2} = -\frac{\partial^2 l(\mat{\Theta})}{\partial(\vec{\overline{\ten{\eta}}_1})\t{\partial(\vec{\mat{\alpha}_1})}}$ and $\mathcal{J}_{2r + 1, 2r + 1} = -\H l(\mat{\Omega}_r)$.
We start by restating the log-likelihood for a single observation $(\ten{X}, y)$ with known $\ten{F}_y$, given by
\begin{displaymath}
l(\mat{\Theta}) = \log h(\ten{X}) + c_1\big\langle\overline{\ten{\eta}}_1 + \ten{F}_{y}\mlm{k\in[r]}\mat{\alpha}_k, \ten{X}\big\rangle + c_2\big\langle\ten{X}\mlm{k\in[r]}\mat{\Omega}_k, \ten{X}\big\rangle - b(\mat{\eta}_{y})
\end{displaymath}
with
\begin{align*}
\mat{\eta}_1(\mat{\theta}_y) &= \mat{\eta}_{y,1} \equiv c_1 (\overline{\ten{\eta}}_1 + \ten{F}_y\times_{k\in[r]}\mat{\alpha}_k) \\
\mat{\eta}_2(\mat{\theta}_y) &= \mat{\eta}_{y,2} \equiv c_2 \bigotimes_{k = r}^1 \mat{\Omega}_k.
\end{align*}
Now let
\begin{displaymath}
\mat{H} = \H b(\mat{\eta}_y) = \begin{pmatrix}
\H b(\mat{\eta}_{y,1}) & \D(\D b(\mat{\eta}_{y,1}))(\mat{\eta}_{y,2}) \\ \D(\D b(\mat{\eta}_{y,2}))(\mat{\eta}_{y,1}) & \H b(\mat{\eta}_{y,2})
\end{pmatrix} = \begin{pmatrix}
\mat{H}_{1,1} & \mat{H}_{1,2} \\ \mat{H}_{2,1} & \mat{H}_{2,2}
\end{pmatrix}
\end{displaymath}
as well as tensor valued reshaped versions
\begin{align*}
\ten{D}_{1} &= \reshape{\mat{p}}(\D b(\mat{\eta}_{y,1})) &
\ten{D}_{2} &= \reshape{(\mat{p}, \mat{p})}(\D b(\mat{\eta}_{y,2})) \\
\ten{H}_{1,1} &= \reshape{(\mat{p}, \mat{p})}(\mat{H}_{1,1}) &
\ten{H}_{1,2} &= \reshape{(\mat{p}, \mat{p}, \mat{p})}(\mat{H}_{1,2}) \\
\ten{H}_{2,1} &= \reshape{(\mat{p}, \mat{p}, \mat{p})}(\mat{H}_{2,1}) &
\ten{H}_{2,2} &= \reshape{(\mat{p}, \mat{p}, \mat{p}, \mat{p})}(\mat{H}_{2,2})
\end{align*}
Note that $\ten{D}_{1}$ is of order $r$, $\ten{D}_{2}$ and $\ten{H}_{1,1}$ are tensors of order $2 r$, $\ten{H}_{1,2}$ and $\ten{H}_{2,1}$ have order $3 r$ and $\ten{H}_{2,2}$ is of order $4 r$.
We get the differentials as
\begin{align*}
\d l(\overline{\ten{\eta}}_1) &= c_1(\langle\d\overline{\ten{\eta}}_1, \ten{X}\rangle - \D b(\mat{\eta}_{y,1})\vec{\d\overline{\ten{\eta}}_1}) \\
\d l(\mat{\alpha}_j) &= c_1(\t{\vec(\ten{X})} - \D b(\mat{\eta}_{y,1}))\vec(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\alpha}_k\times_j\d\mat{\alpha}_j) \\
\d l(\mat{\Omega}_j) &= c_2\Big(\langle\ten{X}\times_{k\in[r]\backslash j}\mat{\Omega}_k\times_j\d\mat{\Omega}_j, \ten{X}\rangle - \D b(\mat{\eta}_{y,2})\vec\!\Big(\bigotimes_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big)\Big)
\end{align*}
Before rewriting the differentials into a form which reveals the derivatives we compute the second order differentials.
{\allowdisplaybreaks\begin{align*}
\d^2 l(\overline{\ten{\eta}}_1)
&= -c_1^2 \t{\d\vec(\overline{\ten{\eta}}_1)}\mat{H}_{1,1}\d\vec(\overline{\ten{\eta}}_1) \\
\d^2 l(\overline{\ten{\eta}}_1, \mat{\alpha}_j)
&= -c_1^2 \t{\vec(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\alpha}_k\times_j\d\mat{\alpha}_j)}\mat{H}_{1,1}\d\vec(\overline{\ten{\eta}}_1) \\
\d^2 l(\overline{\ten{\eta}}_1, \mat{\Omega}_j)
&= -c_1 c_2 \t{\vec\!\Big(\bigotimes_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big)}\mat{H}_{2,1}\d\vec(\overline{\ten{\eta}}_1) \\
\d^2 l(\mat{\alpha}_j)
&= -c_1^2 \t{\vec(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\alpha}_k\times_j\d\mat{\alpha}_j)}\mat{H}_{1,1}\vec(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\alpha}_k\times_j\d\mat{\alpha}_j) \\
\d^2 l(\mat{\alpha}_j, \mat{\alpha}_l)
&\overset{\makebox[0pt]{\scriptsize $j\neq l$}}{=} -c_1^2 \t{\vec(\ten{F}_y\times_{k\in[r]\backslash l}\mat{\alpha}_k\times_l\d\mat{\alpha}_l)}\mat{H}_{1,1}\vec(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\alpha}_k\times_j\d\mat{\alpha}_j) \\
&\qquad + c_1(\t{(\vec{\ten{X}})} - \D b(\mat{\eta}_{y,1}))\vec(\ten{F}_y\times_{k\in[r]\backslash\{j, l\}}\mat{\alpha}_k\times_j\d\mat{\alpha}_j\times_l\d\mat{\alpha}_l) \\
\d^2 l(\mat{\alpha}_j, \mat{\Omega}_l)
&= -c_1 c_2 \t{\vec\!\Big(\bigotimes_{k = r}^{l + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_l\otimes\bigotimes_{k=l-1}^{1}\mat{\Omega}_k\Big)}\mat{H}_{2,1}\vec(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\alpha}_k\times_j\d\mat{\alpha}_j) \\
\d^2 l(\mat{\Omega}_j)
&= -c_2^2 \t{\vec\!\Big(\bigotimes_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big)}\mat{H}_{2,2}\vec\!\Big(\bigotimes_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big) \\
\d^2 l(\mat{\Omega}_j, \mat{\Omega}_l)
&\overset{\makebox[0pt]{\scriptsize $j < l$}}{=} c_2 \langle\ten{X}\times_{k\in[r]\backslash \{j, l\}}\mat{\Omega}_k\times_j\d\mat{\Omega}_j\times_l\d\mat{\Omega}_l, \ten{X}\rangle \\
&\qquad - c_2^2 \t{\vec\!\Big(\bigotimes_{k = r}^{l + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_l\otimes\bigotimes_{k=l-1}^{1}\mat{\Omega}_k\Big)}\mat{H}_{2,2}\vec\!\Big(\bigotimes_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big) \\
&\qquad\qquad - c_2 \D b(\mat{\eta}_{y,2})\vec\!\Big(\bigotimes_{k = r}^{l + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_{l}\otimes\bigotimes_{k = l - 1}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_{j}\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big)
\end{align*}}%
Now we rewrite all the above differentials to extract the derivatives one at a time with extensive usage of the identities from Theorem~\ref{thm:mlm_mat} and Theorem~\ref{thm:mtvk_rearrange}. We start with the first order differentials
{\allowdisplaybreaks\begin{align*}
\d l(\overline{\ten{\eta}}_1) &= c_1\t{\vec(\ten{X} - \ten{D}_1)}\vec{\d\overline{\ten{\eta}}_1} \\
&\qquad\Rightarrow \D l(\overline{\ten{\eta}}_1) \equiv c_1 (\ten{X} - \ten{D}_1) \\
%
\d l(\mat{\alpha}_j) &= c_1 \tr((\ten{X} - \ten{D}_1)_{(j)}\t{(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}\t{\d\mat{\alpha}_j}) \\
&\qquad\Rightarrow \D l(\mat{\alpha}_j) \equiv c_1 (\ten{X} - \ten{D}_1)_{(j)}\t{(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}} \\
%
\d l(\mat{\Omega}_j) &= c_2\Big(\langle\ten{X}\times_{k\in[r]\backslash j}\mat{\Omega}_k\times_j\d\mat{\Omega}_j, \ten{X}\rangle - \D b(\mat{\eta}_{y,2})\vec\!\Big(\bigotimes_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big)\Big) \\
&= c_2 \t{(\vec{\ten{X}}\otimes\vec{\ten{X}} - (\ten{D}_2)_{([2r])})}\vec\!\Big(\bigotimes_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big) \\
&= c_2 (\ten{X}\otimes\ten{X} - \ten{R}_{[2r]}(\ten{D}_2))\mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}\times_j\t{(\vec{\d\mat{\Omega}_j})} \\
&= c_2 \t{\vec\Bigl((\ten{X}\otimes\ten{X} - \ten{R}_{[2r]}(\ten{D}_2))\mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}\Bigr)}\vec{\d\mat{\Omega}_j} \\
&= c_2 \t{\vec\Bigl((\ten{X}\otimes\ten{X} - \ten{R}_{[2r]}(\ten{D}_2))\mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}\Bigr)}\mat{D}_{p_j}\t{\mat{D}_{p_j}}\vec{\d\mat{\Omega}_j} \\
&\qquad\Rightarrow \D l(\mat{\Omega}_j) = c_2 \t{\vec\Bigl((\ten{X}\otimes\ten{X} - \ten{R}_{[2r]}(\ten{D}_2))\mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}\Bigr)}\mat{D}_{p_j}\t{\mat{D}_{p_j}}
\end{align*}}%
The next step is to identify the Hessians from the second differentials in a similar manner as before.
{\allowdisplaybreaks\begin{align*}
&\d^2 l(\overline{\ten{\eta}}_1) \\
&= -c_1^2 \t{(\vec{\d\overline{\ten{\eta}}_1})}\mat{H}_{1,1}\vec{\d\overline{\ten{\eta}}_1} \\
&\qquad\Rightarrow \H l(\overline{\ten{\eta}}_1) = -c_1^2 \mat{H}_{1,1}
\qquad{\color{gray} (p \times p)}
\\
&\d^2 l(\overline{\ten{\eta}}_1, \mat{\alpha}_j) \\
&= -c_1^2 \t{\vec(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\times_j\d\mat{\alpha}_j)}\mat{H}_{1,1}\vec{\d\overline{\ten{\eta}}_1} \\
&= -c_1^2 \t{\vec(\d\mat{\alpha}_j(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)})}\mat{K}_{p,(j)}\mat{H}_{1,1}\vec{\d\overline{\ten{\eta}}_1} \\
&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})}((\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}\otimes\mat{I}_{p_j})(\ten{H}_{1,1})_{((j, [r]\backslash j))}\vec{\d\overline{\ten{\eta}}_1} \\
&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} ( (\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k) \ttt_{[r]\backslash j} \ten{H}_{1,1})_{((2, 1))} \vec{\d\overline{\ten{\eta}}_1} \\
&\qquad\Rightarrow \frac{\partial l}{\partial(\vec{\mat{\alpha}_j})\t{\partial(\vec{\overline{\ten{\eta}}_1)}}} = -c_1^2 ( (\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k) \ttt_{[r]\backslash j} \ten{H}_{1,1})_{((2, 1))}
\qquad{\color{gray} (p_j q_j \times p)}
\\
&\d^2 l(\overline{\ten{\eta}}_1, \mat{\Omega}_j) \\
&= -c_1 c_2 \t{\vec\!\Big(\bigotimes_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big)}\mat{H}_{2,1}\vec{\d\overline{\ten{\eta}}_1} \\
&= -c_1 c_2 \t{\Big[ \t{(\ten{H}_{2,1})_{([2r])}} \vec\!\Big(\bigotimes_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big) \Big]} \vec{\d\overline{\ten{\eta}}_1} \\
&= -c_1 c_2 \t{\vec( \ten{R}_{[2r]}(\ten{H}_{2,1}) \mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}\times_j\t{(\vec{\d\mat{\Omega}_j})} )} \vec{\d\overline{\ten{\eta}}_1} \\
&= -c_1 c_2 \t{(\vec{\d\mat{\Omega}_j})} ( \ten{R}_{[2r]}(\ten{H}_{2,1}) \mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})} )_{(j)} \vec{\d\overline{\ten{\eta}}_1} \\
&\qquad\Rightarrow \frac{\partial l}{\partial(\vec{\mat{\Omega}_j})\t{\partial(\vec{\overline{\ten{\eta}}_1)}}} = -c_1 c_2 \mat{D}_{p_j}\t{\mat{D}_{p_j}}( \ten{R}_{[2r]}(\ten{H}_{2,1}) \mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})} )_{(j)}
\qquad{\color{gray} (p_j^2 \times p)}
\\
&\d^2 l(\mat{\alpha}_j) \\
&= -c_1^2 \t{\vec(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\times_j\d\mat{\alpha}_j)}\mat{H}_{1,1}\vec(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\times_j\d\mat{\alpha}_j) \\
&= -c_1^2 \t{\vec(\d\mat{\alpha}_j(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)})}\mat{K}_{\mat{p},(j)}\mat{H}_{1,1}\t{\mat{K}_{\mat{p},(j)}}\vec(\d\mat{\alpha}_j(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}) \\
&= -c_1^2 \t{[((\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}\otimes\mat{I}_{p_j})\vec{\d\mat{\alpha}_j}]}\mat{K}_{\mat{p},(j)}\mat{H}_{1,1}\t{\mat{K}_{\mat{p},(j)}}((\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}\otimes\mat{I}_{p_j})\vec{\d\mat{\alpha}_j} \\
&= -c_1^2 \t{[((\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}\otimes\mat{I}_{p_j})\vec{\d\mat{\alpha}_j}]}(\ten{H}_{1,1})_{((j,[r]\backslash j),(j,[r]\backslash j))}((\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}\otimes\mat{I}_{p_j})\vec{\d\mat{\alpha}_j} \\
&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})}[ ((\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k)\ttt_{[r]\backslash j}\ten{H}_{1,1})\ttt_{[r]\backslash j + 2,[r]\backslash j}(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k) ]_{((2,1))}\vec{\d\mat{\alpha}_j} \\
&\qquad\Rightarrow \H l(\mat{\alpha}_j) = -c_1^2 \Big[ \left(\Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big)\ttt_{[r]\backslash j}\ten{H}_{1,1}\right)\ttt_{[r]\backslash j + 2}^{[r]\backslash j}\Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \Big]_{((2,1))}
\qquad{\color{gray} (p_j q_j \times p_j q_j)}
\\
&\d^2 l(\mat{\alpha}_j, \mat{\alpha}_l) \\
&\overset{\makebox[0pt]{\scriptsize $j < l$}}{=} -c_1^2 \t{\vec\Bigl(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\times_j\d\mat{\alpha}_j\Bigr)}\mat{H}_{1,1}\vec\Bigl(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\times_l\d\mat{\alpha}_l\Bigr) \\
&\qquad + c_1 (\t{(\vec{\ten{X}})} - \D b(\mat{\eta}_{y,1})) \vec\Bigl(\ten{F}_y\mlm{k\in[r]\backslash\{j,l\}}\mat{\alpha}_k\times_j\d\mat{\alpha}_j\times_l\d\mat{\alpha}_l\Bigr) \\
&= -c_1^2 \t{\vec\biggl( \d\mat{\alpha}_j \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big)_{(j)} \biggr)} \mat{K}_{\mat{p},(j)}\mat{H}_{1,1}\t{\mat{K}_{\mat{p},(l)}} \vec\biggl( \d\mat{\alpha}_l \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big)_{(l)} \biggr) \\
&\qquad + c_1 (\t{(\vec{\ten{X}})} - \D b(\mat{\eta}_{y,1})) \t{\mat{K}_{\mat{p},((j,l))}} \vec\biggl( (\d\mat{\alpha}_l\otimes\d\mat{\alpha}_j) \Big( \ten{F}_y\mlm{k\in[r]\backslash\{j,l\}}\mat{\alpha}_k \Big)_{((j,l))} \biggr) \\
&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big)_{(j)}\otimes\mat{I}_{p_j} \biggr) \mat{K}_{\mat{p},(j)}\mat{H}_{1,1}\t{\mat{K}_{\mat{p},(l)}} \biggl( \t{\Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big)_{(l)}}\otimes\mat{I}_{p_l} \biggr)\vec{\d\mat{\alpha}_l} \\
&\qquad + c_1 (\t{(\vec{\ten{X}})} - \D b(\mat{\eta}_{y,1})) \t{\mat{K}_{\mat{p},((j,l))}} \biggl( \t{\Big( \ten{F}_y\mlm{k\in[r]\backslash\{j,l\}}\mat{\alpha}_k \Big)_{((j,l))}}\otimes\mat{I}_{p_j p_l} \biggr) \vec{(\d\mat{\alpha}_l\otimes\d\mat{\alpha}_j)} \\
&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \vec{\d\mat{\alpha}_l} \\
&\qquad + c_1 \vec\biggl( (\ten{X} - \ten{D}_1) \ttt_{[r]\backslash\{j,l\}} \Big( \ten{F}_y\mlm{k\neq j,l}\mat{\alpha}_k \Big) \biggr) \vec{(\d\mat{\alpha}_l\otimes\d\mat{\alpha}_j)} \\
&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \vec{\d\mat{\alpha}_l} \\
&\qquad + c_1 \t{(\vec{\d\mat{\alpha}_j})} \biggl( (\ten{X} - \ten{D}_1) \ttt_{[r]\backslash\{j,l\}} \Big( \ten{F}_y\mlm{k\neq j,l}\mat{\alpha}_k \Big) \biggr)_{((1,3))} \vec{\d\mat{\alpha}_l} \\
&\qquad \begin{aligned}
\Rightarrow \frac{\partial l}{\partial(\vec{\mat{\alpha}_j})\t{\partial(\vec{\mat{\alpha}_l})}} &=
-c_1^2 \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \\
&\qquad + c_1 \biggl( (\ten{X} - \ten{D}_1) \ttt_{[r]\backslash\{j,l\}} \Big( \ten{F}_y\mlm{k\neq j,l}\mat{\alpha}_k \Big) \biggr)_{((1,3) + [[j > l]])}
\qquad{\color{gray} (p_j q_j \times p_l q_l)}
\end{aligned}
\\
&\d^2 l(\mat{\alpha}_j, \mat{\Omega}_l) \\
&= -c_1 c_2 \t{\vec\Bigl(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\times_j\d\mat{\alpha}_j\Bigr)} \mat{H}_{1,2} \vec\Bigl(\bigkron{k = r}{l + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_l\otimes\bigkron{k=l-1}{1}\mat{\Omega}_k\Bigr) \\
&= -c_1 c_2 \t{\vec\biggl(\d\mat{\alpha}_j\Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big)_{(j)}\biggr)}\mat{K}_{\mat{p},(j)} \t{(\ten{H}_{2,1})_{([2r])}} \vec\Bigl(\bigkron{k = r}{l + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_l\otimes\bigkron{k=l-1}{1}\mat{\Omega}_k\Bigr) \\
&= -c_1 c_2 \t{(\vec{\d\mat{\alpha}_j})}\biggl(\t{\Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big)_{(j)}}\otimes\mat{I}_{p_j}\biggr) \mat{K}_{\mat{p},(j)} \vec\Bigl(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\neq l}\t{(\vec{\mat{\Omega}_k})}\times_l\t{(\vec{\d\mat{\Omega}_l})}\Bigr) \\
&= -c_1 c_2 \t{(\vec{\d\mat{\alpha}_j})}\biggl(\t{\Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big)_{(j)}}\otimes\mat{I}_{p_j}\biggr) \mat{K}_{\mat{p},(j)} \t{\Bigl(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\neq l}\t{(\vec{\mat{\Omega}_k})}\Bigr)_{([r])}}\vec{\d\mat{\Omega}_l} \\
&= -c_1 c_2 \t{(\vec{\d\mat{\alpha}_j})}\biggl( \Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j}^{[r]\backslash j + r} \Bigl(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\neq l}\t{(\vec{\mat{\Omega}_k})}\Bigr) \biggr)_{(r + 2, 1)} \vec{\d\mat{\Omega}_l} \\
&\qquad\Rightarrow \frac{\partial l}{\partial(\vec{\mat{\alpha}_j})\t{\partial(\vec{\mat{\Omega}_l})}} = -c_1 c_2 \biggl( \Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j}^{[r]\backslash j + r} \Bigl(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\neq l}\t{(\vec{\mat{\Omega}_k})}\Bigr) \biggr)_{(r + 2, 1)}\mat{D}_{p_l}\t{\mat{D}_{p_l}}
% \qquad {\color{gray} (p_j q_j \times p_l^2)}
\\
&\d^2 l(\mat{\Omega}_j) \\
&= -c_2^2 \t{\vec\Bigl(\bigkron{k = r}{l + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_l\otimes\bigkron{k=l-1}{1}\mat{\Omega}_k\Bigr)} \t{(\ten{H}_{2,2})_{([2r],[2r]+2r)}} \vec\Bigl(\bigkron{k = r}{l + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_l\otimes\bigkron{k=l-1}{1}\mat{\Omega}_k\Bigr) \\
&= -c_2^2 \ten{R}_{[2r],[2r]+2r}(\ten{H}_{2,2})\mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}\mlm{\substack{k + r\\k\in[r]\backslash j}}\t{(\vec{\mat{\Omega}_k})}\times_j\t{(\vec{\d\mat{\Omega}_j})}\times_{j + r}\t{(\vec{\d\mat{\Omega}_j})} \\
&= -c_2^2 \t{(\vec{\d\mat{\Omega}_j})} \biggl( \ten{R}_{[2r],[2r]+2r}(\ten{H}_{2,2})\mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}\mlm{\substack{k + r\\k\in[r]\backslash j}}\t{(\vec{\mat{\Omega}_k})} \biggr)_{([r])} \vec{\d\mat{\Omega}_j} \\
&\qquad\Rightarrow \H l(\mat{\Omega}_j) = -c_2^2 \mat{D}_{p_j}\t{\mat{D}_{p_j}}\biggl( \ten{R}_{[2r],[2r]+2r}(\ten{H}_{2,2})\mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}\mlm{\substack{k + r\\k\in[r]\backslash j}}\t{(\vec{\mat{\Omega}_k})} \biggr)_{([r])}\mat{D}_{p_j}\t{\mat{D}_{p_j}}
%\qquad {\color{gray} (p_j^2 \times p_j^2)}
\\
&\d^2 l(\mat{\Omega}_j, \mat{\Omega}_l) \\
&\overset{\makebox[0pt]{\scriptsize $j < l$}}{=} c_2 \langle\ten{X}\times_{k\in[r]\backslash \{j, l\}}\mat{\Omega}_k\times_j\d\mat{\Omega}_j\times_l\d\mat{\Omega}_l, \ten{X}\rangle \\
&\qquad - c_2^2 \t{\vec\!\Big(\bigotimes_{k = r}^{l + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_l\otimes\bigotimes_{k=l-1}^{1}\mat{\Omega}_k\Big)}\mat{H}_{2,2}\vec\!\Big(\bigotimes_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big) \\
&\qquad\qquad - c_2 \D b(\mat{\eta}_{y,2})\vec\!\Big(\bigotimes_{k = r}^{l + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_{l}\otimes\bigotimes_{k = l - 1}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_{j}\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big) \\
&= c_2 \t{(\vec{\ten{X}}\otimes\vec{\ten{X}} - (\ten{D}_2)_{([2r])})} \vec\Bigl(\bigotimes_{k = r}^{l + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_{l}\otimes\bigotimes_{k = l - 1}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_{j}\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Bigr) \\
&\qquad - c_2^2 \t{\vec\!\Big(\bigotimes_{k = r}^{l + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_l\otimes\bigotimes_{k=l-1}^{1}\mat{\Omega}_k\Big)}\t{(\ten{H}_{2,2})_{([2r],[2r]+2r)}}\vec\!\Big(\bigotimes_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big) \\
&= c_2 (\ten{X}\otimes\ten{X} - \ten{R}_{[2r]}(\ten{D}_2)) \mlm{k\neq j,l}\t{(\vec{\mat{\Omega}_k})} \times_j \t{(\vec{\d\mat{\Omega}_j})} \times_l \t{(\vec{\d\mat{\Omega}_l})} \\
&\qquad - c_2^2 \ten{R}_{([2r],[2r]+2r)}(\ten{H}_{2,2}) \mlm{k\in [r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \mlm{\substack{k + r \\ k\in [r]\backslash l}}\t{(\vec{\mat{\Omega}_k})} \times_j \t{(\vec{\d\mat{\Omega}_j})} \times_l \t{(\vec{\d\mat{\Omega}_l})} \\
&= c_2 \t{(\vec{\d\mat{\Omega}_j})}\Big((\ten{X}\otimes\ten{X} - \ten{R}_{[2r]}(\ten{D}_2)) \mlm{k\neq j,l}\t{(\vec{\mat{\Omega}_k})} \Big)_{(j)}\vec{\d\mat{\Omega}_l} \\
&\qquad - c_2^2 \t{(\vec{\d\mat{\Omega}_j})}\Big(\ten{R}_{([2r],[2r]+2r)}(\ten{H}_{2,2}) \mlm{k\in [r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \mlm{\substack{k + r \\ k\in [r]\backslash l}}\t{(\vec{\mat{\Omega}_k})}\Big)_{(j)}\vec{\d\mat{\Omega}_l} \\
&\qquad \begin{aligned}\Rightarrow \frac{\partial l}{\partial(\vec{\mat{\Omega}_j})\t{\partial(\vec{\mat{\Omega}_l})}} &=
\mat{D}_{p_j}\t{\mat{D}_{p_j}}\Big[c_2\Big((\ten{X}\otimes\ten{X} - \ten{R}_{[2r]}(\ten{D}_2)) \mlm{k\neq j,l}\t{(\vec{\mat{\Omega}_k})} \Big)_{(j)} \\
&\qquad -c_2^2 \Big(\ten{R}_{([2r],[2r]+2r)}(\ten{H}_{2,2}) \mlm{k\in [r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \mlm{\substack{k + r \\ k\in [r]\backslash l}}\t{(\vec{\mat{\Omega}_k})}\Big)_{(j)}\Big]\mat{D}_{p_l}\t{\mat{D}_{p_l}}
% \qquad {\color{gray} (p_j^2 \times p_l^2)}
\end{aligned}
\end{align*}}%
This concludes the computation of the observed information matrix $\mathcal{J}(\mat{\Theta})$, as all blocks (or their transposed counterparts) are computed. To get to the Fisher information we continue by computing the expectation of the individual blocks $\mathcal{I}_{i,j} = \E[\mathcal{J}_{i,j} \mid Y = y]$. A closer look at the individual blocks shows that only a few of them are random, as only $\frac{\partial l}{\partial(\vec{\mat{\alpha}_j})\t{\partial(\vec{\mat{\alpha}_l})}}$ and $\frac{\partial l}{\partial(\vec{\mat{\Omega}_j})\t{\partial(\vec{\mat{\Omega}_l})}}$ for $j \neq l$ depend on $\ten{X}$. This means the expectation needs to be calculated only for those blocks, because all other blocks are constant with respect to the conditional expectation. Therefore, we start by computing the expectation of some subexpressions encountered in these blocks.
\begin{displaymath}
\E[\ten{X}\mid Y = y] \equiv \vec{\E[\ten{X}\mid Y = y]} = \E[\mat{t}_1(\ten{X})\mid Y = y] = \t{\D b(\mat{\eta}_{y,1})} \equiv \ten{D}_1.
\end{displaymath}
From this we get $\E[\ten{X}\mid Y = y] = \ten{D}_1$ because their shapes are identical. Next consider
\begin{displaymath}
\ten{X}\otimes\ten{X}
= \ten{R}_{[2r]}(\reshape{(\mat{p}, \mat{p})}(\vec{\ten{X}}\otimes\vec{\ten{X}}))
= \ten{R}_{[2r]}(\reshape{(\mat{p}, \mat{p})}(\mat{t}_2(\ten{X}))).
\end{displaymath}
By the linearity of the expectation, rearranging and reshaping we get
\begin{displaymath}
\E[\ten{X}\otimes\ten{X} \mid Y = y]
= \ten{R}_{[2r]}(\reshape{(\mat{p}, \mat{p})}(\E[\mat{t}_2(\ten{X})\mid Y = y]))
= \ten{R}_{[2r]}(\reshape{(\mat{p}, \mat{p})}(\D b(\mat{\eta}_{y,2})))
= \ten{R}_{[2r]}(\ten{D}_2)
\end{displaymath}
and with that all auxiliary calculations are done. Using the two expectations yields the blocks of the Fisher information
\begin{displaymath}
\mathcal{I}(\mat{\Theta}) = \E[\mathcal{J}(\mat{\Theta})\mid Y = y] = \begin{pmatrix}
\mathcal{I}_{1,1} & \mathcal{I}_{1,2} & \cdots & \mathcal{I}_{1, 2 r + 1} \\
\vdots & \vdots & \ddots & \vdots \\
\mathcal{I}_{2 r + 1, 1} & \mathcal{I}_{2 r + 1, 2} & \cdots & \mathcal{I}_{2 r + 1, 2 r + 1}
\end{pmatrix}
\end{displaymath}
and for every block holds $\mathcal{I}_{j, l} = \t{\mathcal{I}_{l, j}}$. The individual blocks are given with $j = 1, ..., r$ and $j \leq l \leq r$ by
\begin{align*}
\mathcal{I}_{1,1} &= c_1^2 (\ten{H}_{1,1})_{([r])} \\
\mathcal{I}_{1,j+1} % = \E\partial_{\vec{\overline{\ten{\eta}}_1}}\partial_{\t{(\vec{\mat{\alpha}_j})}} l(\mat{\Theta})\mid \ten{Y} = y
&= c_1^2 \Big[\Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1}\Big]_{((2, 1))} \\
\mathcal{I}_{1,j+r+1}
&= c_1 c_2 \Big( \ten{R}_{[2r]}(\ten{H}_{2,1}) \mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \Big)_{(j)} \\
\mathcal{I}_{j+1,l+1}
&= c_1^2 \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \\
\mathcal{I}_{j+1,l+r+1}
&= c_1 c_2 \biggl( \Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j}^{[r]\backslash j + r} \Bigl(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\neq l}\t{(\vec{\mat{\Omega}_k})}\Bigr) \biggr)_{((r + 2, 1))} \\
\mathcal{I}_{j+r+1,l+r+1}
&= c_2^2 \Big(\ten{R}_{([2r],[2r]+2r)}(\ten{H}_{2,2}) \mlm{k\in [r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \mlm{\substack{k + r \\ k\in [r]\backslash l}}\t{(\vec{\mat{\Omega}_k})}\Big)_{(j)}
\end{align*}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Vectorization and Matricization}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The \emph{matricization} is a generalization of the \emph{vectorization} operation, and the \emph{mode product} generalizes the matrix-matrix product, which is used in the \emph{multi-linear multiplication}. In this section we provide some relations between these operations in conjunction with the Kronecker product.
\begin{theorem}\label{thm:mlm_mat}
Let $\ten{A}$ be a tensor of order $r$ with dimensions $q_1\times ... \times q_r$. Furthermore, for $k = 1, ..., r$, let $\mat{B}_k$ be matrices of dimensions $p_k\times q_k$. Then, for any $(\mat{i}, \mat{j})\in\perm{r}$, it holds that
\begin{displaymath}
\Big(\ten{A}\mlm{k\in[r]}\mat{B}_k\Big)_{(\mat{i}, \mat{j})}
= \Big(\bigotimes_{k = \len{\mat{i}}}^{1}\mat{B}_{\mat{i}_k}\Big) \ten{A}_{(\mat{i}, \mat{j})} \Big(\bigotimes_{k = \len{\mat{j}}}^{1}\t{\mat{B}_{\mat{j}_k}}\Big).
\end{displaymath}
\end{theorem}
A well known special case of Theorem~\ref{thm:mlm_mat} is the relation between vectorization and the Kronecker product
\begin{displaymath}
\vec(\mat{B}_1\mat{A}\t{\mat{B}_2}) = (\mat{B}_2\otimes\mat{B}_1)\vec{\mat{A}}.
\end{displaymath}
Here we have a matrix, that is, an order $2$ tensor, and the vectorization as a special case of the matricization, $\vec{\mat{A}} = \mat{A}_{((1, 2))}$ with $(\mat{i}, \mat{j}) = ((1, 2), ())\in\perm{2}$. Note that the empty Kronecker product is $1$ by convention.
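This special case is easily verified numerically (a minimal sketch using column-major vectorization):
\begin{verbatim}
import numpy as np

A  = np.random.randn(3, 4)
B1 = np.random.randn(5, 3)
B2 = np.random.randn(6, 4)
lhs = (B1 @ A @ B2.T).reshape(-1, order='F')      # vec(B_1 A B_2')
rhs = np.kron(B2, B1) @ A.reshape(-1, order='F')  # (B_2 (x) B_1) vec(A)
assert np.allclose(lhs, rhs)
\end{verbatim}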
Before we state another identity of interest which arises in matrix calculus we define a new operation similar in nature to the rearrangement operation in \cite{ApproxKron-VanLoanPitsianis1993}.
\begin{defn}
Let $\ten{A}$ be a $2 r + 2 s + t$ tensor where $r > 0$ and $s, t \geq 0$ of dimensions $q_1\times ... \times q_{2 r + 2 s + t}$. Furthermore, let $(\mat{i}, \mat{j}, \mat{k})\in\perm{2 r + 2 s + t}$ such that $\len{\mat{i}} = 2 r$ and $\len{\mat{j}} = 2 s$ with which the operation $\ten{R}_{\mat{i}, \mat{j}}$ is defined as
\begin{displaymath}
\ten{R}_{\mat{i}, \mat{j}}(\ten{A}) = \reshape{\mat{d}}(\ten{A}_{((\pi(\mat{i}), \rho(\mat{j})))})
\end{displaymath}
where both $\pi, \rho$ are \emph{inverse perfect outer shuffles} and the new dimension $\mat{d}$ is given by
\begin{displaymath}
\mat{d} = (q_{\mat{i}_1} q_{\mat{i}_{r + 1}}, ..., q_{\mat{i}_{r}} q_{\mat{i}_{2 r}}, q_{\mat{j}_1} q_{\mat{j}_{s + 1}}, ..., q_{\mat{j}_s} q_{\mat{j}_{2 s}}, q_{2 r + 2 s + 1}, ..., q_{2 r + 2 s + t}).
\end{displaymath}
In the case of $s = 0$ we drop the second index and write $\ten{R}_{\mat{i}}$ because $\mat{j}$ is empty.
\end{defn}
The operation $\ten{R}_{\mat{i}, \mat{j}}(\ten{A})$ results in a tensor of order $r + s + t$, as the reshape operation collapses pairs of consecutive axes after shuffling. In particular, the single-index version $\ten{R}_{\mat{i}}$ applied to an order $2 r + t$ tensor yields a tensor of order $r + t$.
\begin{center}
\begin{tikzpicture}[>=latex]
\foreach \t in {1,...,13} {
\node at (\t, 2) {$q_{\t}$};
}
% \foreach \card [count=\y, evaluate=\y as \h using {\y / 3.5}] in {4,...,2,A} {
\foreach \t [count=\f] in {1, 3, 5, 2, 4, 6} {
\node[label=above:$\mat{i}_\f$] (f) at (\f, 1) {$\f$};
\node[label=below:$\pi(\mat{i})_\t$] (t) at (\t, -1) {$\f$};
\draw[->] (f) to[out=270, in=90] (t);
}
\begin{scope}[xshift=6cm]
\foreach \t [count=\f, evaluate=\f as \ff using {int(\f + 6)}] in {1, 3, 2, 4} {
\node[label=above:$\mat{j}_\f$] (f) at (\f, 1) {$\ff$};
\node[label=below:$\rho(\mat{j})_\t$] (t) at (\t, -1) {$\ff$};
\draw[->] (f) to[out=270, in=90] (t);
}
\end{scope}
\begin{scope}[xshift=10cm]
\foreach \t [count=\f, evaluate=\f as \ff using {int(\f + 10)}] in {1, 2, 3} {
\node[label=above:$\mat{k}_\f$] (f) at (\f, 1) {$\ff$};
\node[label=below:$\mat{k}_\t$] (t) at (\t, -1) {$\ff$};
\draw[->] (f) to[out=270, in=90] (t);
}
\end{scope}
\foreach \t [count=\f] in {1,4,2,5,3,6,7,9,8,10,11,12,13} {
\node (d-\f) at (\f, -2) {$q_{\t}$};
}
\foreach \l\r [count=\f] in {1/4,2/5,3/6,7/9,8/10} {
\node[label=below:$d_{\f}$] (e-\f) at ({2 * \f - 0.5}, -4) {$q_{\l} q_{\r}$};
}
\foreach \t [count=\f from 6] in {11,12,13} {
\node[label=below:$d_{\f}$] (e-\t) at (\t, -4) {$q_{\t}$};
}
\draw[->] (d-1) to[out=270, in=90] (e-1); \draw (d-2) to[out=270, in=90] (e-1);
\draw[->] (d-3) to[out=270, in=90] (e-2); \draw (d-4) to[out=270, in=90] (e-2);
\draw[->] (d-5) to[out=270, in=90] (e-3); \draw (d-6) to[out=270, in=90] (e-3);
\draw[->] (d-7) to[out=270, in=90] (e-4); \draw (d-8) to[out=270, in=90] (e-4);
\draw[->] (d-9) to[out=270, in=90] (e-5); \draw (d-10) to[out=270, in=90] (e-5);
\draw[->] (d-11) -- (e-11);
\draw[->] (d-12) -- (e-12);
\draw[->] (d-13) -- (e-13);
\node[anchor=east] at (0.5, 2) {Dims. $\ten{A}$};
\node[anchor=east] at (0.5, 1) {Modes};
\node[anchor=east] at (0.5, -1) {Perm. Modes};
\node[anchor=east] at (0.5, -2) {Perm. Dims.};
\node[anchor=east] at (0.5, -4) {Dims. $\ten{R}_{\mat{i}, \mat{j}}(\ten{A})$};
\end{tikzpicture}
\end{center}
\begin{theorem}\label{thm:mtvk_rearrange}
    Let $\ten{A}$ be a tensor of order $2 r + s$, where $r > 0$ and $s \geq 0$, with dimensions $q_1\times ... \times q_{2 r + s}$. Furthermore, let $(\mat{i}, \mat{j})\in\perm{2 r + s}$ such that $\len{\mat{i}} = 2 r$ and let $\mat{B}_k$, $k = 1, ..., r$, be matrices of dimensions $q_{\mat{i}_{k}}\times q_{\mat{i}_{r + k}}$. Then
    \begin{displaymath}
        \t{\ten{A}_{(\mat{i})}}\vec{\bigotimes_{k = r}^{1}\mat{B}_k}
        \equiv \ten{R}_{\mat{i}}(\ten{A})\times_{k\in[r]}\t{(\vec{\mat{B}_k})}.
\end{displaymath}
\end{theorem}
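The following Python/NumPy sketch (our own illustration, again using column-major reshapes) checks Theorem~\ref{thm:mtvk_rearrange} in the special case $r = 2$, $s = 0$ with $\mat{i} = (1, 2, 3, 4)$, where $\ten{A}_{(\mat{i})} = \vec{\ten{A}}$ and the inverse perfect outer shuffle is $\pi(\mat{i}) = (1, 3, 2, 4)$.
\begin{verbatim}
import numpy as np

vec = lambda M: M.reshape(-1, order="F")
rng = np.random.default_rng(2)
q = (2, 3, 4, 5)
A  = rng.normal(size=q)
B1 = rng.normal(size=(q[0], q[2]))    # q_1 x q_3
B2 = rng.normal(size=(q[1], q[3]))    # q_2 x q_4

# left-hand side: A_(i)' vec(B_2 kron B_1) with i = (1, 2, 3, 4), a scalar
lhs = vec(A) @ vec(np.kron(B2, B1))

# R_i(A): permute the axes by the inverse perfect outer shuffle (1, 3, 2, 4)
# and collapse the consecutive axis pairs (column-major reshape)
R = np.transpose(A, (0, 2, 1, 3)).reshape(q[0]*q[2], q[1]*q[3], order="F")
rhs = vec(B1) @ R @ vec(B2)
print(np.allclose(lhs, rhs))          # True
\end{verbatim}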
\todo{continue}
% Next we define a specific axis permutation and reshaping operation on tensors which will be helpful in expressing some derivatives. Let $\ten{A}$ be a $2 r + s$ tensor with $r > 0$ and $s\geq 0$ of dimensions $p_1\times ... \times p_{2 r + s}$. Furthermore, let $(\mat{i}, \mat{j})\in\perm{2 r + s}$ such that $\len{\mat{i}} = 2 r$. The operation $\ten{R}_{\mat{i}}$ is defined as
% \begin{displaymath}
% \ten{R}_{\mat{i}} = \reshape{(p_1 p_{r + 1}, ..., p_r p_{2 r}, p_{2 r + 1}, ..., p_{r + s})}(\ten{A}_{(\pi(\mat{i}))})
% \end{displaymath}
% The operation $\ten{R}_{\mat{i}}$ is now defined in two steps. First, the axis of $\ten{A}$ are permuted by $(\pi(\mat{i}), \mat{j})$ where $\pi(\mat{i})$ is the perfect outer shuffle of $\mat{i}$. Second, the first $2 r$ axis are collapsed into $r$ axis via reshaping by vectorizing consecutive axis pairs. \todo{Explane, define (understandable)!} Using the operation $\ten{R}_{\mat{i}}$ we get the relation
% \begin{displaymath}
% \t{\ten{A}_{(\mat{i})}}\vec\bigotimes_{k = r}^{1}\mat{B}_k \equiv \ten{R}_{\mat{i}}(\ten{A})\times_{k\in[r]}\vec{\mat{B}_k}.
% \end{displaymath}
% Even more general, let $(\mat{i}, \mat{j})\in\perm{r}$ and $\ten{A}$ an order $r$ tensor, then
% \begin{displaymath}
% (\ten{A}\times_{k\in[r]}\mat{B}_k)_{(\mat{i}, \mat{j})} = \Big(\bigotimes_{k = \len{\mat{i}}}^{1}\mat{B}_{\mat{i}_k}\Big)
% \ten{A}_{(\mat{i})}\Big(\bigotimes_{k = \len{\mat{j}}}^{1}\t{\mat{B}_{\mat{j}_k}}\Big).
% \end{displaymath}
% \begin{displaymath}
% \vec(\ten{A}\times_{k\in[r]}\mat{B}_k) = \Big(\bigotimes_{k = r}^1 \mat{B}_k\Big)\vec\ten{A}
% \end{displaymath}
% \begin{displaymath}
% (\ten{A}\times_{k\in[r]}\mat{B}_k)_{(j)} = \mat{B}_j\ten{A}_{(j)}\bigotimes_{\substack{k = r\\k\neq j}}^1\t{\mat{B}_k}
% \end{displaymath}
% of which a special case is $(\ten{A}\times_{j}\mat{B}_j)_{(j)} = \mat{B}_j\ten{A}_{(j)}$.
% For example, let $\ten{A}$ be of order $6$ and $\mat{i} = (1, 3, 2, 4)$, $\mat{j} = (5, 6)$, then
% \begin{displaymath}
% (\ten{A}\times_{k\in[r]}\mat{B}_k)_{((1, 3, 2, 4), (5, 6))} = (\mat{B}_4\otimes\mat{B}_2\otimes\mat{B}_3\otimes\mat{B}_1)\ten{A}_{((1, 3, 2, 4), (5, 6))}(\t{\mat{B}_6}\otimes\t{\mat{B}_5}).
% \end{displaymath}
% where $\ttt$ is the tensor times tensor product.
% Another useful identity for two tensors $\ten{A}, \ten{B}$ of dimensions $p_1\times ... \times p_{j-1}\times q_j\times p_{j+1}\times p_r$ and $p_1\times ... \times p_{k-1}\times q_k\times p_{k+1}\times p_r$, respectively. Furthermore, we have an order $2 r$ tensor $\ten{H}$ of dimensions $(\mat{p}, \mat{p})$ and two matrices $\mat{C}, \mat{D}$ of dimensions $p_j\times q_j$ and $p_k\times q_k$, then
% \begin{align*}
% \t{\vec(\ten{A}\times_j\mat{C})}&\ten{H}_{([r])}\vec(\ten{B}\times_k\mat{D}) \\
% &= \t{(\vec{\mat{C}})}((\ten{H}\ttt_{[r]\backslash j}\ten{A})\ttt_{([r]\backslash k) + 1, [r]\backslash k}\ten{B})_{(1, 3)}\vec{\mat{D}} \\
% &= \t{(\vec{\mat{C}})}((\ten{H}\ttt_{([r]\backslash k) + r, [r]\backslash k}\ten{B})\ttt_{[r]\backslash k}\ten{A})_{(1, 4)}\vec{\mat{D}} \\
% &= \t{(\vec{\mat{C}})} \reshape{(p_j q_j, p_k q_k)}(\reshape{(p_j, \mat{p}, q_j)}(\ten{H}_{(j, [r] + r)}\t{\ten{A}_{(j)}})_{(1, r + 2, k + 1)}\t{\ten{B}_{(k)}}) \vec{\mat{D}}
% \end{align*}
% Let $\ten{A}$ be an order $r$ tensor and $\mat{B}$ of dimensions $p_j\times q$ a matching matrix such that $\ten{A}\times_j\mat{B}$ is of dimensions $p_1\times...\times p_r$. Furthermore, let $\ten{H}$ be a tensor of dimensions $p_1\times...\times p_r\times d_1\times ... \times d_s$ which makes it an order $r + s$ tensor, then
% \begin{displaymath}
% \t{\vec(\ten{A}\times_j\mat{B})}\ten{H}_{([r])} = \t{(\vec{\mat{B}})}(\ten{H}\ttt_{[r]\backslash j}\ten{A})_{(1, s + 2)}.
% \end{displaymath}
\begin{center}
\begin{tikzpicture}[>=latex, x=1cm, y=-1cm, z=0.5cm, scale = 1.3]
\tikzstyle{axis} = [
->,
color = gray,
shorten <=3pt
];
\tikzstyle{number} = [
pos = 0.5,
text = black,
fill = white
];
\begin{scope}[xshift=0cm]
\node[anchor=south] at (1, -0.5) {$\ten{A}_{([4])}$};
\node[anchor=north, text=gray] at (1, -0.5) {\footnotesize Matrix};
\draw[axis] (0, 0) -- +(0, 1) node[number] {$1$};
\draw[axis] (0, 1) -- +(0, 1) node[number] {$2$};
\draw[axis] (0, 2) -- +(0, 1) node[number] {$3$};
\draw[axis] (0, 3) -- +(0, 1) node[number] {$4$};
\draw[axis] (0, 0) -- +(1, 0) node[number] {$5$};
\draw[axis] (1, 0) -- +(1, 0) node[number] {$6$};
\draw (.1, .1) rectangle (1.9, 3.9);
\begin{scope}[xshift=-.5cm, yshift=-8cm]
\node[anchor=south] at (1.5, -1.5) {$\ten{R}_{([4])}(\ten{A})$};
\node[anchor=north, text=gray] at (1.5, -1.5) {\footnotesize 3D Tensor};
\draw[axis] (0, 0, 0) -- (0, 1, 0) node[number] {$1$};
\draw[axis] (0, 1, 0) -- (0, 2, 0) node[number] {$3$};
\draw[axis] (0, 0, 0) -- (1, 0, 0) node[number] {$2$};
\draw[axis] (1, 0, 0) -- (2, 0, 0) node[number] {$4$};
\draw[axis] (0, 0, 0) -- (0, 0, 1) node[number] {$5$};
\draw[axis] (0, 0, 1) -- (0, 0, 2) node[number] {$6$};
\draw (.1, .1) rectangle (1.9, 1.9);
\draw (.1, .1) -- ++(0, 0, 1.8) -- ++(1.8, 0) -- (1.9, .1);
\draw (1.9, .1, 1.8) -- ++(0, 1.8) -- (1.9, 1.9);
\end{scope}
\end{scope}
\begin{scope}[xshift=3cm]
\node[anchor=south] at (1, -0.5) {$\ten{A}_{([6])}$};
\node[anchor=north, text=gray] at (1, -0.5) {\footnotesize Vector};
\draw[axis] (0, 0) -- +(0, 1) node[number] {$1$};
\draw[axis] (0, 1) -- +(0, 1) node[number] {$2$};
\draw[axis] (0, 2) -- +(0, 1) node[number] {$3$};
\draw[axis] (0, 3) -- +(0, 1) node[number] {$4$};
\draw[axis] (0, 4) -- +(0, 1) node[number] {$5$};
\draw[axis] (0, 5) -- +(0, 1) node[number] {$6$};
\draw (.1, .1) rectangle (.3, 5.9);
\begin{scope}[xshift=-.5cm, yshift=-8cm]
\node[anchor=south] at (1.5, -1.5) {$\ten{R}_{([6])}(\ten{A})$};
\node[anchor=north, text=gray] at (1.5, -1.5) {\footnotesize 3D Tensor};
\draw[axis] (0, 0, 0) -- (0, 1, 0) node[number] {$1$};
\draw[axis] (0, 1, 0) -- (0, 2, 0) node[number] {$4$};
\draw[axis] (0, 0, 0) -- (1, 0, 0) node[number] {$2$};
\draw[axis] (1, 0, 0) -- (2, 0, 0) node[number] {$5$};
\draw[axis] (0, 0, 0) -- (0, 0, 1) node[number] {$3$};
\draw[axis] (0, 0, 1) -- (0, 0, 2) node[number] {$6$};
\draw (.1, .1) rectangle (1.9, 1.9);
\draw (.1, .1) -- ++(0, 0, 1.8) -- ++(1.8, 0) -- (1.9, .1);
\draw (1.9, .1, 1.8) -- ++(0, 1.8) -- (1.9, 1.9);
\end{scope}
\end{scope}
\begin{scope}[xshift=6cm]
\node[anchor=south] at (1, -0.5) {$\ten{A}_{((1, 5, 2, 6))}$};
\node[anchor=north, text=gray] at (1, -0.5) {\footnotesize Matrix};
\draw[axis] (0, 0) -- +(0, 1) node[number] {$1$};
\draw[axis] (0, 1) -- +(0, 1) node[number] {$5$};
\draw[axis] (0, 2) -- +(0, 1) node[number] {$2$};
\draw[axis] (0, 3) -- +(0, 1) node[number] {$6$};
\draw[axis] (0, 0) -- +(1, 0) node[number] {$3$};
\draw[axis] (1, 0) -- +(1, 0) node[number] {$4$};
\draw (.1, .1) rectangle (1.9, 3.9);
\begin{scope}[xshift=-.5cm, yshift=-8cm]
\node[anchor=south] at (1.5, -1.5) {$\ten{R}_{((1, 5, 2, 6))}(\ten{A})$};
\node[anchor=north, text=gray] at (1.5, -1.5) {\footnotesize 3D Tensor};
\draw[axis] (0, 0, 0) -- (0, 1, 0) node[number] {$1$};
\draw[axis] (0, 1, 0) -- (0, 2, 0) node[number] {$2$};
\draw[axis] (0, 0, 0) -- (1, 0, 0) node[number] {$5$};
\draw[axis] (1, 0, 0) -- (2, 0, 0) node[number] {$6$};
\draw[axis] (0, 0, 0) -- (0, 0, 1) node[number] {$3$};
\draw[axis] (0, 0, 1) -- (0, 0, 2) node[number] {$4$};
\draw (.1, .1) rectangle (1.9, 1.9);
\draw (.1, .1) -- ++(0, 0, 1.8) -- ++(1.8, 0) -- (1.9, .1);
\draw (1.9, .1, 1.8) -- ++(0, 1.8) -- (1.9, 1.9);
\end{scope}
\end{scope}
\end{tikzpicture}
\end{center}
\newcommand{\somedrawing} {
\coordinate (a) at (-2,-2,-2);
\coordinate (b) at (-2,-2,2);
\coordinate (c) at (-2,2,-2);
\coordinate (d) at (-2,2,2);
\coordinate (e) at (2,-2,-2);
\coordinate (f) at (2,-2,2);
\coordinate (g) at (2,2,-2);
\coordinate (h) at (2,2,2);
\draw (a)--(b) (a)--(c) (a)--(e) (b)--(d) (b)--(f) (c)--(d) (c)--(g) (d)--(h) (e)--(f) (e)--(g) (f)--(h) (g)--(h);
\fill (a) circle (0.1cm);
\fill (d) ++(0.1cm,0.1cm) rectangle ++(-0.2cm,-0.2cm);
}
% \begin{figure}[ht!]
% \centering
% \begin{tikzpicture}[scale = 2, x = 1cm, y = 1cm, z = 0.5cm]
% \begin{scope}
% \foreach \y in {8,...,5} {
% \draw[canvas is xz plane at y={-\y / 5}, fill = gray!30]
% (0, 0) node[anchor = east] {$\y$} rectangle (1, 1)
% node[pos = 0.5, transform shape, scale = 2] {$\y$};
% }
% \foreach \y in {4,...,1} {
% \draw[canvas is xz plane at y={-\y / 5}, fill = white]
% (0, 0) node[anchor = east] {$\y$} rectangle (1, 1)
% node[pos = 0.5, transform shape, scale = 2] {$\y$};
% }
% \end{scope}
% \begin{scope}[xshift = 3cm]
% \draw[canvas is xz plane at y={-8 / 5}, fill = gray!30] (0, 0) node[anchor = east] {$8$} rectangle (1, 1) node[pos = 0.5, transform shape, scale = 2] {$8$};
% \draw[canvas is xz plane at y={-7 / 5}, fill = white] (0, 0) node[anchor = east] {$4$} rectangle (1, 1) node[pos = 0.5, transform shape, scale = 2] {$4$};
% \draw[canvas is xz plane at y={-6 / 5}, fill = gray!30] (0, 0) node[anchor = east] {$7$} rectangle (1, 1) node[pos = 0.5, transform shape, scale = 2] {$7$};
% \draw[canvas is xz plane at y={-5 / 5}, fill = white] (0, 0) node[anchor = east] {$3$} rectangle (1, 1) node[pos = 0.5, transform shape, scale = 2] {$3$};
% \draw[canvas is xz plane at y={-4 / 5}, fill = gray!30] (0, 0) node[anchor = east] {$6$} rectangle (1, 1) node[pos = 0.5, transform shape, scale = 2] {$6$};
% \draw[canvas is xz plane at y={-3 / 5}, fill = white] (0, 0) node[anchor = east] {$2$} rectangle (1, 1) node[pos = 0.5, transform shape, scale = 2] {$2$};
% \draw[canvas is xz plane at y={-2 / 5}, fill = gray!30] (0, 0) node[anchor = east] {$5$} rectangle (1, 1) node[pos = 0.5, transform shape, scale = 2] {$5$};
% \draw[canvas is xz plane at y={-1 / 5}, fill = white] (0, 0) node[anchor = east] {$1$} rectangle (1, 1) node[pos = 0.5, transform shape, scale = 2] {$1$};
% \end{scope}
% \end{tikzpicture}
% \caption{\label{fig:perfect_outer_shuffle}Left: sorted deck of $8$ cards, Right: perfect outer shuffle of the deck on the left.}
% \end{figure}
\newcommand{\hearts}[1]{%
\begin{scope}[every node/.style={transform shape, text = red}]
\draw[rounded corners=4pt, fill = white] (1.6, 0) rectangle (0, 2)
node[pos = 0.5, scale = 3] {#1};
\node[scale = 1, rotate=180] at (1.4, 0.2) {$#1$};
\node[scale = 1, rotate=180] at (1.4, 0.5) {$\heartsuit$};
\node[scale = 1] at (0.2, 1.8) {$#1$};
\node[scale = 1] at (0.2, 1.5) {$\heartsuit$};
\end{scope}
}
\newcommand{\spades}[1]{%
\begin{scope}[every node/.style={transform shape, text = black}]
\draw[rounded corners=4pt, fill = white] (1.6, 0) rectangle (0, 2)
node[pos = 0.5, scale = 3] {#1};
\node[scale = 1, rotate=180] at (1.4, 0.2) {$#1$};
\node[scale = 1, rotate=180] at (1.4, 0.5) {$\spadesuit$};
\node[scale = 1] at (0.2, 1.8) {$#1$};
\node[scale = 1] at (0.2, 1.5) {$\spadesuit$};
\end{scope}
}
\begin{figure}[ht!]
\centering
\begin{tikzpicture}[>=latex]
% \draw[->] (0, 0, 0) -- (3, 0, 0) node[anchor = west] {$x$};
% \draw[->] (0, 0, 0) -- (0, 3, 0) node[anchor = west] {$y$};
% \draw[->] (0, 0, 0) -- (0, 0, 3) node[anchor = west] {$z$};
% \coordinate (op) at (2, 0, 0);
% \coordinate (xp) at (1.6, 0, -2);
% \coordinate (yp) at (3, 0, 0.5);
% \draw[->, red] (0, 0, 0) -- (op) node[anchor = south east] {$o_{p}$};
% \draw[->, red] (0, 0, 0) -- (xp) node[anchor = south east] {$x_{p}$};
% \draw[->, red] (0, 0, 0) -- (yp) node[anchor = north east] {$y_{p}$};
% \begin{scope}[canvas is plane={O(2, 0, 0)x(1.6, 0, -2)y(3, 0, 0.5)}]
% \draw[->, blue] (0, 0) -- (1, 0) node[anchor = south west] {$x_{i}$};
% \draw[->, blue] (0, 0) -- (0, 1) node[anchor = north west] {$y_{i}$};
% \end{scope}
\begin{scope}
\node at ({3 / 2 - 0.2}, 6) {Identity};
\foreach \t [count=\f] in {1, 2, 3, 4, 5, 6, 7, 8} {
\node (from) at ({\f / 3 - 0.2}, 5.4) {$\f$};
\node (to) at ({\t / 3 - 0.2}, 4.2) {$\f$};
\draw[->, gray] (from.south) to[out=270, in=90] (to.north);
}
\foreach \card [count=\y, evaluate=\y as \h using {\y / 3.5}] in {4,...,2,A} {
\begin{scope}[canvas is plane={O(0, \h, 0)x(1, \h, 0.5)y(-0.2, \h, -2)}]
\spades{\card};
\end{scope}
}
\foreach \card [count=\y, evaluate=\y as \h using {(\y + 4) / 3.5}] in {4,...,2,A} {
\begin{scope}[canvas is plane={O(0, \h, 0)x(1, \h, 0.5)y(-0.2, \h, -2)}]
\hearts{\card};
\end{scope}
}
\end{scope}
\begin{scope}[xshift = 3.5cm]
\node at ({3 / 2 - 0.2}, 6) {Perfect Shuffle};
\foreach \t [count=\f] in {1, 5, 2, 6, 3, 7, 4, 8} {
\node (from) at ({\f / 3 - 0.2}, 5.4) {$\f$};
\node (to) at ({\t / 3 - 0.2}, 4.2) {$\f$};
\draw[->, gray] (from.south) to[out=270, in=90] (to.north);
}
\begin{scope}[canvas is plane={O(0, {1 / 3.5}, 0)x(1, {1 / 3.5}, 0.5)y(-0.2, {1 / 3.5}, -2)}]
\spades{4};
\end{scope}
\begin{scope}[canvas is plane={O(0, {2 / 3.5}, 0)x(1, {2 / 3.5}, 0.5)y(-0.2, {2 / 3.5}, -2)}]
\hearts{4};
\end{scope}
\begin{scope}[canvas is plane={O(0, {3 / 3.5}, 0)x(1, {3 / 3.5}, 0.5)y(-0.2, {3 / 3.5}, -2)}]
\spades{3};
\end{scope}
\begin{scope}[canvas is plane={O(0, {4 / 3.5}, 0)x(1, {4 / 3.5}, 0.5)y(-0.2, {4 / 3.5}, -2)}]
\hearts{3};
\end{scope}
\begin{scope}[canvas is plane={O(0, {5 / 3.5}, 0)x(1, {5 / 3.5}, 0.5)y(-0.2, {5 / 3.5}, -2)}]
\spades{2};
\end{scope}
\begin{scope}[canvas is plane={O(0, {6 / 3.5}, 0)x(1, {6 / 3.5}, 0.5)y(-0.2, {6 / 3.5}, -2)}]
\hearts{2};
\end{scope}
\begin{scope}[canvas is plane={O(0, {7 / 3.5}, 0)x(1, {7 / 3.5}, 0.5)y(-0.2, {7 / 3.5}, -2)}]
\spades{A};
\end{scope}
\begin{scope}[canvas is plane={O(0, {8 / 3.5}, 0)x(1, {8 / 3.5}, 0.5)y(-0.2, {8 / 3.5}, -2)}]
\hearts{A};
\end{scope}
\end{scope}
\begin{scope}[xshift = 7cm]
\node at ({3 / 2 - 0.2}, 6) {Inverse Shuffle};
\foreach \t [count=\f] in {1, 3, 5, 7, 2, 4, 6, 8} {
\node (from) at ({\f / 3 - 0.2}, 5.4) {$\f$};
\node (to) at ({\t / 3 - 0.2}, 4.2) {$\f$};
\draw[->, gray] (from.south) to[out=270, in=90] (to.north);
}
\begin{scope}[canvas is plane={O(0, {1 / 3.5}, 0)x(1, {1 / 3.5}, 0.5)y(-0.2, {1 / 3.5}, -2)}]
\spades{4};
\end{scope}
\begin{scope}[canvas is plane={O(0, {2 / 3.5}, 0)x(1, {2 / 3.5}, 0.5)y(-0.2, {2 / 3.5}, -2)}]
\spades{2};
\end{scope}
\begin{scope}[canvas is plane={O(0, {3 / 3.5}, 0)x(1, {3 / 3.5}, 0.5)y(-0.2, {3 / 3.5}, -2)}]
\hearts{4};
\end{scope}
\begin{scope}[canvas is plane={O(0, {4 / 3.5}, 0)x(1, {4 / 3.5}, 0.5)y(-0.2, {4 / 3.5}, -2)}]
\hearts{2};
\end{scope}
\begin{scope}[canvas is plane={O(0, {5 / 3.5}, 0)x(1, {5 / 3.5}, 0.5)y(-0.2, {5 / 3.5}, -2)}]
\spades{3};
\end{scope}
\begin{scope}[canvas is plane={O(0, {6 / 3.5}, 0)x(1, {6 / 3.5}, 0.5)y(-0.2, {6 / 3.5}, -2)}]
\spades{A};
\end{scope}
\begin{scope}[canvas is plane={O(0, {7 / 3.5}, 0)x(1, {7 / 3.5}, 0.5)y(-0.2, {7 / 3.5}, -2)}]
\hearts{3};
\end{scope}
\begin{scope}[canvas is plane={O(0, {8 / 3.5}, 0)x(1, {8 / 3.5}, 0.5)y(-0.2, {8 / 3.5}, -2)}]
\hearts{A};
\end{scope}
\end{scope}
\end{tikzpicture}
\caption{\label{fig:perfect_outer_shuffle}Left: sorted deck of $8$ cards, Center: perfect outer shuffle, Right: inverse perfect outer shuffle}
\end{figure}
% Let $\ten{A}$ be a $p_1\times ... \times p_r\times q_1\times ... \times q_r\times d_1\times ...\times d_s$ tensor and $\mat{B}_k$ be $p_k\times q_k$ matrices, then
% \begin{displaymath}
% \t{\Big(\vec{\bigotimes_{k = r}^{1}\mat{B}_k}\Big)}\ten{A}_{([2 r])}
% = \t{(\vec{\mat{B}_k})}(\operatorname{\mathcal{R}}(\ten{A})\times_{k\in[r]\backslash j}\t{(\vec{\mat{B}_k})})_{([2 r])}
% \end{displaymath}
% Let $\ten{A}$ be a $n\times p_1\times ... \times p_r\times q_1\times ... \times q_r$ tensor and $\mat{B}_k$ be $p_k\times q_k$ matrices, then
% \begin{displaymath}
% \ten{A}_{(1)} \vec{\bigotimes_{k = r}^{1}\mat{B}_k}
% =
% \Big(\ten{R}(\ten{A})\times_{\substack{k + 1\\k\in[r]}}\t{\vec(\mat{B}_k)}\Big)_{(1)}
% \end{displaymath}
% where $\ten{R}$ is a permutation of the axis and reshaping of the tensor $\ten{A}$. This axis permutation converts $n\times p_1\times ... \times p_r\times q_1\times ... \times q_r$ to $n\times p_1\times q_1 \times ... \times p_r\times q_r$ and the reshaping vectorizes the axis pairs $p_k\times q_k$ leading to a tensor $\ten{R}(\ten{A})$ of dimensions $n\times p_1 q_1\times ...\times p_r q_r$.
% An alternative way to write this is for each of the $i\in[n]$ vector components is
% \begin{displaymath}
% \Big(\ten{A}_{(1)}\vec{\bigotimes_{k = r}^{1}\mat{B}_k}\Big)_{i}
% = \sum_{J\in[(\mat{p}, \mat{q})]}
% \ten{A}_{i, J}\prod_{k = 1}^r (B_k)_{J_k, J_{k + r}}
% \end{displaymath}
% using the notation $J\in[(\mat{p}, \mat{q})] = [p_1]\times ... \times [p_r]\times [q_1]\times ... \times [q_r]$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Pattern Matrices}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The \emph{duplication matrix} $\mat{D}_p$ of dimensions $p^2\times p(p + 1) / 2$ is defined implicitly by the property that for any symmetric $p\times p$ matrix $\mat{A}$
\begin{displaymath}
\mat{D}_p\vech\mat{A} = \vec{\mat{A}}.
\end{displaymath}
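A minimal Python/NumPy sketch of this definition (our own, assuming the usual column-major $\vech$ that stacks the columns of the lower triangle) constructs $\mat{D}_p$ explicitly from its defining pattern:
\begin{verbatim}
import numpy as np

def dup(p):
    # duplication matrix D_p: D_p vech(A) = vec(A) for symmetric A
    D = np.zeros((p*p, p*(p+1)//2))
    col = 0
    for j in range(p):           # column-major lower triangle (vech order)
        for i in range(j, p):
            D[i + j*p, col] = 1  # vec position of entry (i, j)
            D[j + i*p, col] = 1  # vec position of entry (j, i)
            col += 1
    return D

p = 4
A = np.random.default_rng(3).normal(size=(p, p)); A = A + A.T
vech = A.T[np.triu_indices(p)]           # stack lower-triangular columns
print(np.allclose(dup(p) @ vech, A.reshape(-1, order="F")))   # True
\end{verbatim}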
Let $\mat{A}$ be a $p\times q$ matrix; then we denote by $\mat{K}_{p,q}$ the \emph{commutation matrix}, the $p q\times p q$ matrix satisfying
\begin{displaymath}
\mat{K}_{p,q}\vec\mat{A} = \vec{\t{\mat{A}}}.
\end{displaymath}
For matrices $\mat{A}$ and $\mat{B}$ of dimensions $a_1\times a_2$ and $b_1\times b_2$, respectively, the identity giving the commutation matrix its name is
\begin{displaymath}
\mat{A}\otimes\mat{B} = \mat{K}_{a_1,b_1}(\mat{B}\otimes\mat{A})\t{\mat{K}_{a_2,b_2}}.
\end{displaymath}
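Both the defining property and the commutation identity are easy to check numerically; the following sketch (our own construction, column-major vectorization) builds $\mat{K}_{p,q}$ entry by entry:
\begin{verbatim}
import numpy as np

def comm(p, q):
    # commutation matrix K_{p,q}: K vec(A) = vec(A') for p x q matrices A
    K = np.zeros((p*q, p*q))
    for i in range(p):
        for j in range(q):
            K[j + i*q, i + j*p] = 1   # vec(A')[j + i q] = A[i, j] = vec(A)[i + j p]
    return K

rng = np.random.default_rng(4)
A = rng.normal(size=(2, 5))   # a1 x a2
B = rng.normal(size=(3, 4))   # b1 x b2
print(np.allclose(comm(2, 5) @ A.reshape(-1, order="F"),
                  A.T.reshape(-1, order="F")))                        # True
print(np.allclose(np.kron(A, B),
                  comm(2, 3) @ np.kron(B, A) @ comm(5, 4).T))         # True
\end{verbatim}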
For a generalization of the commutation matrix let $\ten{A}$ be a $p_1\times ...\times p_r$ tensor of order $r$. Then the \emph{generalized commutation matrix} $\mat{K}_{(p_1, ..., p_r),(j)}$ is implicitly defined such that
\begin{displaymath}
\mat{K}_{(p_1, ..., p_r),(j)}\vec{\ten{A}} = \vec{\ten{A}_{(j)}}
\end{displaymath}
for every mode $j \in[r]$. This is a direct generalization of the commutation matrix with the special case $\mat{K}_{(p,q),(2)} = \mat{K}_{p,q}$ and the trivial case $\mat{K}_{(p_1, ..., p_r),(1)} = \mat{I}_{p}$ for $p = \prod_{j=1}^r p_j$. Furthermore, with a dimension vector $\mat{p} = (p_1, ..., p_r)$ it is convenient to write $\mat{K}_{(p_1, ..., p_r),(j)}$ as $\mat{K}_{\mat{p},(j)}$. Its relation to the classic commutation matrix is given by
\begin{displaymath}
\mat{K}_{\mat{p}, (j)} = \mat{I}_{\overline{p}_j} \otimes \mat{K}_{\underline{p}_j, p_j}
\end{displaymath}
where $\overline{p}_j = \prod_{k = j + 1}^r p_k$ and $\underline{p}_j = \prod_{k = 1}^{j - 1}p_k$ with an empty product set to $1$.
The generalized commutation matrix leads to a generalization of the Kronecker product commutation identity
\begin{displaymath}
\bigotimes_{\substack{k = r\\k\neq j}}^{1}\mat{A}_k\otimes \mat{A}_j = \mat{K}_{\mat{p}, (j)}\Big(\bigotimes_{k = r}^1 \mat{A}_k\Big)\t{\mat{K}_{\mat{q}, (j)}}
\end{displaymath}
for arbitrary matrices $\mat{A}_k$ of dimensions $p_k\times q_k$, $k \in[r]$, which are collected in the dimension vectors $\mat{p} = (p_1, ..., p_r)$ and $\mat{q} = (q_1, ..., q_r)$.

Next, the \emph{symmetrizer} $\mat{N}_p$ is a $p^2\times p^2$ matrix such that for any $p\times p$ matrix $\mat{A}$
\begin{displaymath}
\mat{N}_p \vec{\mat{A}} = \frac{1}{2}(\vec{\mat{A}} + \vec{\t{\mat{A}}}).
\end{displaymath}
Another matrix which might come in handy is the \emph{selection matrix} $\mat{S}_p$ of dimensions $p\times p^2$ which selects the diagonal elements of a $p\times p$ matrix $\mat{A}$ from its vectorization
\begin{displaymath}
\mat{S}_p\vec{\mat{A}} = \diag{\mat{A}}
\end{displaymath}
where $\diag{\mat{A}}$ denotes the vector of diagonal elements of $\mat{A}$.
For two matrices $\mat A$ and $\mat B$ of dimensions $a_1\times a_2$ and $b_1\times b_2$, respectively, it holds that
\begin{equation}\label{eq:vecKron}
\vec(\mat A\otimes\mat B) = (\mat{I}_{a_2}\otimes\mat{K}_{b_2,a_1}\otimes\mat{I}_{b_1})(\vec\mat A\otimes\vec\mat B).
\end{equation}
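Identity \eqref{eq:vecKron} can be checked numerically as well; in the sketch below (our own, with $\mat{K}_{p,q}$ built as a row-permuted identity) the dimensions $a_1 = 3$, $a_2 = 4$, $b_1 = 2$, $b_2 = 5$ are arbitrary:
\begin{verbatim}
import numpy as np

def comm(p, q):
    # K_{p,q} as a row-permuted identity: K vec(A) = vec(A') for p x q A
    perm = np.arange(p*q).reshape(p, q, order="F").T.reshape(-1, order="F")
    return np.eye(p*q)[perm]

vec = lambda M: M.reshape(-1, order="F")
rng = np.random.default_rng(5)
A = rng.normal(size=(3, 4))   # a1 x a2
B = rng.normal(size=(2, 5))   # b1 x b2

lhs = vec(np.kron(A, B))
rhs = np.kron(np.kron(np.eye(4), comm(5, 3)), np.eye(2)) @ np.kron(vec(A), vec(B))
print(np.allclose(lhs, rhs))   # True
\end{verbatim}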
The pattern matrices defined above satisfy the following identities:
\begin{align*}
\pinv{\mat{D}_p} &= (\t{\mat{D}_p}\mat{D}_p)^{-1}\t{\mat{D}_p} \\
\pinv{\mat{D}_p}\mat{D}_p &= \mat{I}_{p(p+1)/2} \\
\mat{D}_p\pinv{\mat{D}_p} &= \mat{N}_{p} \\
\t{\mat{K}_{p,q}} &= \mat{K}_{p,q}^{-1} = \mat{K}_{q,p} \\
\t{\mat{K}_{\mat{p},(j)}} &= \mat{K}_{\mat{p},(j)}^{-1}
\end{align*}
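These identities are easily confirmed numerically. In the sketch below (our own, for $p = 3$) the matrices $\mat{D}_p$ and $\mat{K}_{p,p}$ are built from their defining patterns, and $\mat{N}_p = \tfrac{1}{2}(\mat{I}_{p^2} + \mat{K}_{p,p})$ follows directly from the symmetrizer definition:
\begin{verbatim}
import numpy as np

p = 3
# D_p from its defining pattern, K_{p,p} as a permuted identity, N_p = (I + K)/2
D = np.zeros((p*p, p*(p+1)//2)); col = 0
for j in range(p):
    for i in range(j, p):
        D[i + j*p, col] = D[j + i*p, col] = 1; col += 1
perm = np.arange(p*p).reshape(p, p, order="F").T.reshape(-1, order="F")
K = np.eye(p*p)[perm]
N = (np.eye(p*p) + K) / 2

Dpinv = np.linalg.solve(D.T @ D, D.T)               # Moore-Penrose inverse of D_p
print(np.allclose(Dpinv @ D, np.eye(p*(p+1)//2)))   # True
print(np.allclose(D @ Dpinv, N))                    # True
print(np.allclose(K.T, np.linalg.inv(K)))           # True
\end{verbatim}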
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Further Identities}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{displaymath}
\tr(\mat{A} \mat{B} \mat{C} \mat{D}) = \t{(\vec{\t{\mat{B}}})}(\t{\mat{A}}\otimes \mat{C})\vec{\mat{D}}
\end{displaymath}
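A quick numerical check of this trace identity (our own sketch, with arbitrary conformable dimensions):
\begin{verbatim}
import numpy as np

vec = lambda M: M.reshape(-1, order="F")
rng = np.random.default_rng(6)
# dimensions chosen so that the product A B C D is square
A = rng.normal(size=(2, 3)); B = rng.normal(size=(3, 4))
C = rng.normal(size=(4, 5)); D = rng.normal(size=(5, 2))

lhs = np.trace(A @ B @ C @ D)
rhs = vec(B.T) @ np.kron(A.T, C) @ vec(D)
print(np.allclose(lhs, rhs))   # True
\end{verbatim}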
Let $\ten{A}, \ten{B}$ be two tensors of order $r$, then
\begin{displaymath}
\ten{A}\otimes\ten{B} = (\ten{A}\circ\ten{B})_{((2r - 1, 2r - 3, ..., 1), (2r, 2r - 2, ..., 2))}
\end{displaymath}
where $\circ$ is the outer product. For example, considering two matrices $\mat{A}, \mat{B}$, the above simplifies to
\begin{displaymath}
\mat{A}\otimes\mat{B} = (\mat{A}\circ\mat{B})_{((3, 1), (4, 2))}.
\end{displaymath}
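For the matrix case this can be checked numerically as follows (our own sketch; the matricization is realized by an axis permutation followed by a column-major reshape, with the first listed mode varying fastest):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(7)
A = rng.normal(size=(2, 3)); B = rng.normal(size=(4, 5))

# outer product A o B, then matricization with row modes (3, 1), column modes (4, 2)
outer = np.multiply.outer(A, B)                      # shape (2, 3, 4, 5)
mat = np.transpose(outer, (2, 0, 3, 1)).reshape(4*2, 5*3, order="F")
print(np.allclose(np.kron(A, B), mat))               # True
\end{verbatim}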
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Matrix Calculus}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Let $f$ be an $r$ times differentiable function, then
\begin{displaymath}
\d^r f(\mat{X}) = \ten{F}(\mat{X})\mlm{k = 1}{r} \vec{\d\mat{X}}
\qquad\Leftrightarrow\qquad
\D^r f(\mat{X}) \equiv \frac{1}{r!}\sum_{\sigma\in\perm{r}}\ten{F}(\mat{X})_{(\sigma)}
\end{displaymath}
where $\perm{r}$ denotes the set of all permutations of $[r]$.
\begin{example}[Derivative with respect to a symmetric matrix]
    Suppose we have a function $\mat{F}(\mat{X})$ where $\mat{X}$ is a symmetric $p\times p$ matrix, and we want to find the derivative of $\mat{F}$ with respect to $\mat{X}$ under the symmetry constraint. Since $\vec{\mat{X}} = \mat{D}_p\vech{\mat{X}}$ for symmetric $\mat{X}$, the function $\mat{F}$ depends on $\mat{X}$ only through $\vech{\mat{X}}$ and the differential becomes
\begin{displaymath}
\d\mat{F}(\mat{X})
= \d\mat{F}(\mat{D}_p\vech{\mat{X}})
= \D\mat{F}(\mat{X})\mat{D}_p\d\vech{\mat{X}}
= \D\mat{F}(\mat{X})\mat{D}_p\D(\vech{\mat{X}})(\mat{X})\vec{\d\mat{X}}.
\end{displaymath}
To find the value of $\D(\vech{\mat{X}})(\mat{X})$ we look at its components
\begin{displaymath}
(\D(\vech{\mat{X}})(\mat{X}))_{i, j} = \frac{(\partial \vech{X})_i}{(\partial \vec{X})_j}.
\end{displaymath}
    Via an explicit indexing of the half vectorization and the vectorization operation in terms of the matrix indices of $\mat{X}$, that is with $i(k, l)\in[p(p + 1) / 2]$ and $j(m, n)\in[p^2]$, where $1 \leq l\leq k \leq p$ and $m, n\in[p]$, given by
    \begin{align*}
        i(k, l) &= (l - 1)(p + 1) + 1 - \frac{l(l - 1)}{2} + k - l, \\
        j(m, n) &= (n - 1)p + m,
    \end{align*}
    we get that
\begin{displaymath}
\frac{(\partial \vech{X})_{i(k,l)}}{(\partial \vec{X})_{j(m,n)}} = \begin{cases}
1 & \text{iff }(k,l) = (m,n)\text{ or }(k,l) = (n,m) \\
0 & \text{else}
\end{cases}.
\end{displaymath}
    This already reveals its explicit value, as the transposed duplication matrix $\t{\mat{D}_p}$ matches exactly this pattern, meaning
\begin{displaymath}
\D(\vech{\mat{X}})(\mat{X}) = \t{\mat{D}_p}.
\end{displaymath}
This means that from the differential $\d\mat{F}(\mat{Y}) = \mat{G}(\mat{Y})\vec{\d\mat{Y}}$ where $\mat{Y}$ is an arbitrary square matrix we can identify the derivative of $\mat{F}$ with respect to a symmetric $\mat{X}$ as
\begin{displaymath}
\d\mat{F}(\mat{Y}) = \mat{G}(\mat{Y})\vec{\d\mat{Y}}
\qquad\Rightarrow\qquad
\D\mat{F}(\mat{X}) = \mat{G}(\mat{X})\mat{D}_p\t{\mat{D}_p}\text{ for }\mat{X} = \t{\mat{X}}.
\end{displaymath}
\end{example}
\begin{example}
We want to find the derivative with respect to any of the $r$ symmetric $p_j\times p_j$ matrices $\mat{\Omega}_j$ where $j = 1, ..., r$ of the Kronecker product
\begin{displaymath}
\mat{F} = \bigotimes_{k = r}^1 \mat{\Omega}_k.
\end{displaymath}
Therefore, denote
\begin{align*}
p &= \prod_{k = 1}^r p_k, & \overline{p}_j &= \prod_{k = j + 1}^r p_k, & \underline{p}_j &= \prod_{k = 1}^{j - 1} p_k, \\
& & \overline{\mat{\Omega}}_j &= \bigotimes_{k = r}^{j+1}\mat{\Omega}_k, & \underline{\mat{\Omega}}_j &= \bigotimes_{k = j - 1}^{1}\mat{\Omega}_k
\end{align*}
    which slightly simplifies the following. With this notation we have $p = \overline{p}_jp_j\underline{p}_j$ for each $j = 1, ..., r$. Furthermore, the matrices $\overline{\mat{\Omega}}_j$ and $\underline{\mat{\Omega}}_j$ are of dimensions $\overline{p}_j\times \overline{p}_j$ and $\underline{p}_j\times \underline{p}_j$, respectively. We start with the differential
\begin{align*}
\d\mat{F} &= \d\bigotimes_{k = r}^1 \mat{\Omega}_k
= \sum_{j = 1}^r \bigotimes_{k = r}^{j+1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k = j - 1}^{1}\mat{\Omega}_k
= \sum_{j = 1}^r \overline{\mat{\Omega}}_j\otimes\d\mat{\Omega}_j\otimes\underline{\mat{\Omega}}_j \\
&= \sum_{j = 1}^r \mat{K}_{\overline{p}_j p_j,\underline{p}_j}(\underline{\mat{\Omega}}_j\otimes\overline{\mat{\Omega}}_j\otimes\d\mat{\Omega}_j)\mat{K}_{\underline{p}_j,\overline{p}_j p_j}
\end{align*}
    Vectorizing, this transforms to
\begin{align*}
\d\vec\mat{F} &= \sum_{j = 1}^r (\mat{K}_{\overline{p}_j p_j,\underline{p}_j}\otimes\mat{K}_{\overline{p}_jp_j,\underline{p}_j})\vec(\underline{\mat{\Omega}}_j\otimes\overline{\mat{\Omega}}_j\otimes\d\mat{\Omega}_j) \\
&= \sum_{j = 1}^r (\mat{K}_{\overline{p}_j p_j,\underline{p}_j}\otimes\mat{K}_{\overline{p}_jp_j,\underline{p}_j})(\mat{I}_{\overline{p}_j\underline{p}_j}\otimes\mat{K}_{p_j,\overline{p}_j\underline{p}_j}\otimes\mat{I}_{p_j})(\vec(\underline{\mat{\Omega}}_j\otimes\overline{\mat{\Omega}}_j)\otimes\d\vec\mat{\Omega}_j) \\
&= \sum_{j = 1}^r (\mat{K}_{\overline{p}_j p_j,\underline{p}_j}\otimes\mat{K}_{\overline{p}_jp_j,\underline{p}_j})(\mat{I}_{\overline{p}_j\underline{p}_j}\otimes\mat{K}_{p_j,\overline{p}_j\underline{p}_j}\otimes\mat{I}_{p_j})(\vec(\underline{\mat{\Omega}}_j\otimes\overline{\mat{\Omega}}_j)\otimes\mat{I}_{p_j^2})\,\d\vec\mat{\Omega}_j \\
\end{align*}
leading to
\begin{displaymath}
\D\mat{F}(\mat{\Omega}_j) = (\mat{K}_{\overline{p}_j p_j,\underline{p}_j}\otimes\mat{K}_{\overline{p}_jp_j,\underline{p}_j})(\mat{I}_{\overline{p}_j\underline{p}_j}\otimes\mat{K}_{p_j,\overline{p}_j\underline{p}_j}\otimes\mat{I}_{p_j})(\vec(\underline{\mat{\Omega}}_j\otimes\overline{\mat{\Omega}}_j)\otimes\mat{I}_{p_j^2})
\end{displaymath}
for each $j = 1, ..., r$. Note that the $p^2\times p^2$ matrices
\begin{displaymath}
\mat{P}_j = (\mat{K}_{\overline{p}_j p_j,\underline{p}_j}\otimes\mat{K}_{\overline{p}_jp_j,\underline{p}_j})(\mat{I}_{\overline{p}_j\underline{p}_j}\otimes\mat{K}_{p_j,\overline{p}_j\underline{p}_j}\otimes\mat{I}_{p_j})
\end{displaymath}
    are permutation matrices.
\end{example}
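As an illustration (our own sketch, not part of the derivation), the formula for $\D\mat{F}(\mat{\Omega}_j)$ can be checked numerically for $r = 2$: since the Kronecker product is linear in each factor, applying $\D\mat{F}(\mat{\Omega}_j)$ to $\vec{\mat{E}}$ must return $\vec(\mat{\Omega}_2\otimes\mat{E})$ for $j = 1$ and $\vec(\mat{E}\otimes\mat{\Omega}_1)$ for $j = 2$.
\begin{verbatim}
import numpy as np

def comm(p, q):
    # commutation matrix K_{p,q} as a row-permuted identity
    perm = np.arange(p*q).reshape(p, q, order="F").T.reshape(-1, order="F")
    return np.eye(p*q)[perm]

vec = lambda M: M.reshape(-1, order="F")
rng = np.random.default_rng(8)
p1, p2 = 2, 3
O1 = rng.normal(size=(p1, p1)); O1 = O1 + O1.T
O2 = rng.normal(size=(p2, p2)); O2 = O2 + O2.T

# j = 1: the leading factor K_{p2 p1, 1} (x) K_{p2 p1, 1} is the identity
DF1 = (np.kron(np.eye(p2), np.kron(comm(p1, p2), np.eye(p1)))
       @ np.kron(vec(O2)[:, None], np.eye(p1*p1)))
# j = 2
DF2 = (np.kron(comm(p2, p1), comm(p2, p1))
       @ np.kron(np.eye(p1), np.kron(comm(p2, p1), np.eye(p2)))
       @ np.kron(vec(O1)[:, None], np.eye(p2*p2)))

# the Kronecker product is linear in each factor, so these checks are exact
E1 = rng.normal(size=(p1, p1)); E2 = rng.normal(size=(p2, p2))
print(np.allclose(DF1 @ vec(E1), vec(np.kron(O2, E1))))   # True
print(np.allclose(DF2 @ vec(E2), vec(np.kron(E2, O1))))   # True
\end{verbatim}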
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Stuff}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Let $X, Y$ be $p$- and $q$-dimensional random vectors, respectively. Furthermore, let $\E X = \mu_X$, $\E Y = \mu_Y$ as well as $\cov(X) = \mat{\Sigma}_X$ and $\cov(Y) = \mat{\Sigma}_Y$. Then define the standardized random variables $Z_X = \mat{\Sigma}_X^{-1/2}(X - \mu_X)$ and $Z_Y = \mat{\Sigma}_Y^{-1/2}(Y - \mu_Y)$. For the standardized variables holds $\E Z_X = 0_p$, $\E Z_Y = 0_q$ and for the covariances we get $\cov(Z_X) = \mat{I}_p$ as well as $\cov(Z_Y) = \mat{I}_q$. Now we take a look at the cross-covariance between $X$ and $Y$
\begin{displaymath}
\cov(X, Y)
    = \cov(X - \mu_X, Y - \mu_Y)
= \cov(\mat{\Sigma}_X^{1/2} Z_X, \mat{\Sigma}_Y^{1/2} Z_Y)
= \mat{\Sigma}_X^{1/2}\cov(Z_X, Z_Y)\mat{\Sigma}_Y^{1/2}.
\end{displaymath}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Proofs}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{proof}[Proof of Theorem~\ref{thm:sdr}]
By Theorem~1 in \cite{sdr-BuraDuarteForzani2016} we have that
\begin{displaymath}
\mat{R}(\ten{X}) = \t{\mat{B}}(\mat{t}(\ten{X}) - \E\mat{t}(\ten{X}))
\end{displaymath}
    is a sufficient reduction under the exponential family \eqref{eq:exp-family} where $\mat{B}\in\mathbb{R}^{p(p + 1)\times q}$ with $\Span(\mat{B}) = \Span(\{\mat{\eta}_Y - \E_{Y}\mat{\eta}_Y : Y\in\mathcal{S}_Y\})$. With $\E_Y\mat{\eta}_{Y,1} \equiv c_1\E[\overline{\ten{\eta}}_1 - \ten{F}_Y\times_{k\in[r]}\mat{\alpha}_k] = c_1 \overline{\ten{\eta}}_1$, since $\E_Y\ten{F}_Y = 0$, and with $\mat{\eta}_{Y,2}$ not depending on $Y$ (despite the notation), we get
\begin{displaymath}
\mat{\eta}_Y - \E_{Y}\mat{\eta}_Y = \begin{pmatrix}
\mat{\eta}_{Y,1} - \E_{Y}\mat{\eta}_{Y,1} \\
\mat{\eta}_{Y,2} - \E_{Y}\mat{\eta}_{Y,2}
\end{pmatrix} = \begin{pmatrix}
c_1\vec(\ten{F}_Y\times_{k\in[r]}\mat{\alpha}_k) \\
\mat{0}
\end{pmatrix}.
\end{displaymath}
Noting that
\begin{displaymath}
c_1\vec(\ten{F}_Y\times_{k\in[r]}\mat{\alpha}_k)
= c_1\Big( \bigotimes_{k = r}^{1}\mat{\alpha}_k \Big)\vec(\ten{F}_Y)
\end{displaymath}
we get
\begin{displaymath}
\mat{B} = \begin{pmatrix}
c_1 \bigotimes_{k = r}^{1}\mat{\alpha}_k \\
\mat{0}_{p^2\times q}
\end{pmatrix}.
\end{displaymath}
Simplifying leads to
\begin{displaymath}
        \t{\mat{B}}(\mat{t}(\ten{X}) - \E\mat{t}(\ten{X})) = c_1 \t{\Big( \bigotimes_{k = r}^{1}\mat{\alpha}_k \Big)}(\mat{t}_1(\ten{X}) - \E\mat{t}_1(\ten{X})).
\end{displaymath}
    Now note that $\Span(\mat{A}) = \Span(c \mat{A})$ for any matrix $\mat{A}$ and non-zero scalar $c$, and recall the definition $\mat{t}_1(\ten{X}) = \vec{\ten{X}}$, which concludes the proof.
\end{proof}
Before we start the proof of Theorem~\ref{thm:grad}, we illustrate the different dimensions appearing in one of the gradients;
\begin{displaymath}
\underbrace{ \mat{D}_{p_j}\t{\mat{D}_{p_j}} }_{\makebox[0pt]{\scriptsize $p_j^2\times p_j^2$}}
%
\underbrace{%
\overbrace{\reshape{(\mat{p}, \mat{p})}\!\!\Big(\sum_{i = 1}^n
            \underbrace{ (\mat{t}_2(\ten{X}_i) - \invlink_2(\mat{\eta}_{y_i})) }_{p^2\times 1}
\Big)}^{\substack{\text{(tensor of order $2 r$)}\\p_1\times p_2\times ... \times p_r\times p_1\times p_2\times ... \times p_r}} \!\!\makebox[0pt]{\phantom{\Big)}}_{(j, r + j)}
}_{\substack{p_j^2\times (p / p_j)^2\\\text{(matricized / $j^{\text{th}}$ axis as rows)}}}
%
\underbrace{%
        \vec \overbrace{ \bigotimes_{\substack{k = r\\k\neq j}}^{1}\mat{\Omega}_k }^{\makebox[0pt]{\scriptsize $(p/p_j)\times (p/p_j)$}}
}_{\makebox[0pt]{\scriptsize $(p/p_j)^2\times 1$}}
\end{displaymath}
\begin{proof}[Proof of Theorem~\ref{thm:grad}]
\todo{THIS}
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Distributions}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Tensor Normal Distribution}
Let $\ten{X}$ be a multi-dimensional array random variable of order $r$ with dimensions $p_1\times ... \times p_r$ written as
\begin{displaymath}
\ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r).
\end{displaymath}
Its density is given by
\begin{displaymath}
    f(\ten{X}) = (2\pi)^{-p / 2}\Big( \prod_{i = 1}^r |\mat{\Delta}_i|^{p / p_i} \Big)^{-1/2}
\exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right)
\end{displaymath}
where $p = \prod_{i = 1}^r p_i$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution
\begin{displaymath}
\vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1).
\end{displaymath}
\begin{theorem}[Tensor Normal via Multi-Variate Normal]
    Let $\ten{X}$ be a multi-dimensional random variable of order $r$ with dimensions $p_1\times ... \times p_r$, let $\ten{\mu}$ be a mean tensor of the same order and dimensions as $\ten{X}$, and let the mode covariance matrices $\mat{\Delta}_i$ be of dimensions $p_i\times p_i$ for $i = 1, ..., r$. Then the tensor normal distribution is equivalent to the multi-variate normal distribution by the relation
\begin{displaymath}
\ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r)
\qquad\Leftrightarrow\qquad
\vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes ...\otimes \mat{\Delta}_1)
\end{displaymath}
where $p = \prod_{i = 1}^r p_i$.
\end{theorem}
\begin{proof}
    A straightforward way is to rewrite the Tensor Normal density as the density of a Multi-Variate Normal distribution depending on the vectorization of $\ten{X}$. First consider
\begin{multline*}
\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle
= \t{\vec(\ten{X} - \mu)}\vec((\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\}) \\
= \t{\vec(\ten{X} - \mu)}(\mat{\Delta}_r^{-1}\otimes ...\otimes\mat{\Delta}_1^{-1})\vec(\ten{X} - \mu) \\
= \t{(\vec{\ten{X}} - \vec{\mu})}(\mat{\Delta}_r\otimes ...\otimes\mat{\Delta}_1)^{-1}(\vec\ten{X} - \vec\mu).
\end{multline*}
Next, using a property of the determinant of a Kronecker product $|\mat{\Delta}_1\otimes\mat{\Delta}_2| = |\mat{\Delta}_1|^{p_2}|\mat{\Delta}_2|^{p_1}$ yields
\begin{displaymath}
|\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1|
= |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_2|^{p_1}|\mat{\Delta}_1|^{p / p_1}
\end{displaymath}
where $p = \prod_{j = 1}^r p_j$. By induction over $r$ the relation
\begin{displaymath}
|\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1|
= \prod_{i = 1}^r |\mat{\Delta}_i|^{p / p_i}
\end{displaymath}
holds for arbitrary order $r$. Substituting into the Tensor Normal density leads to
\begin{align*}
f(\ten{X}) = \Big( (2\pi)^p |\mat{\Delta}| \Big)^{-1/2}
\exp\!\left( -\frac{1}{2}\t{(\vec\ten{X} - \vec\mu)}\mat{\Delta}^{-1}(\vec\ten{X} - \vec\mu) \right)
\end{align*}
with $\mat{\Delta} = \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1$ which is the Multi-Variate Normal density of the $p$ dimensional vector $\vec\ten{X}$ with mean $\vec\mu$ and covariance $\mat{\Delta}$.
\end{proof}
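The determinant relation used in the proof can be confirmed numerically; the following sketch (our own, with arbitrarily chosen positive definite $\mat{\Delta}_k$) compares log-determinants for numerical stability.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(10)
p = (2, 3, 4); ptot = np.prod(p)
deltas = []
for pk in p:
    M = rng.normal(size=(pk, pk))
    deltas.append(M @ M.T + pk * np.eye(pk))    # positive definite Delta_k

D = deltas[-1]
for Dk in deltas[-2::-1]:
    D = np.kron(D, Dk)                          # Delta_r kron ... kron Delta_1

lhs = np.linalg.slogdet(D)[1]
rhs = sum(ptot / pk * np.linalg.slogdet(Dk)[1] for Dk, pk in zip(deltas, p))
print(np.isclose(lhs, rhs))                     # True
\end{verbatim}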
When sampling from the Multi-Array Normal, one way is to sample from the Multi-Variate Normal and then reshape the result, but this is usually very inefficient because it requires storing the $p\times p$ multi-variate covariance matrix. Instead, it is more efficient to sample $\ten{Z}$ as a tensor of the same shape as $\ten{X}$ with standard normal entries and then transform $\ten{Z}$ to follow the Multi-Array Normal as follows
\begin{displaymath}
\ten{Z}\sim\mathcal{TN}(0, \mat{I}_{p_1}, ..., \mat{I}_{p_r})
\quad\Rightarrow\quad
\ten{X} = \ten{Z}\times\{\mat{\Delta}_1^{1/2}, ..., \mat{\Delta}_r^{1/2}\} + \mu\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r).
\end{displaymath}
where sampling from the standard Multi-Array Normal is done by drawing every element of $\ten{Z}$ independently from a standard Normal distribution.
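A possible implementation of this sampling scheme in Python/NumPy (our own sketch; the function name \texttt{rtensornorm} and the use of Cholesky factors in place of the symmetric square roots $\mat{\Delta}_k^{1/2}$ are our choices, any square root yields the same distribution) could look as follows.
\begin{verbatim}
import numpy as np

def rtensornorm(n, mu, deltas, rng):
    # draw n samples from TN(mu, Delta_1, ..., Delta_r) without ever forming
    # the p x p Kronecker covariance; Cholesky factors serve as square roots
    roots = [np.linalg.cholesky(D) for D in deltas]
    X = rng.normal(size=(n, *mu.shape))           # i.i.d. standard normal entries
    for k, R in enumerate(roots):                 # mode-k product with Delta_k^{1/2}
        X = np.moveaxis(np.tensordot(X, R, axes=(k + 1, 1)), -1, k + 1)
    return X + mu

p = (2, 3, 4)
rng = np.random.default_rng(9)
deltas = []
for pk in p:
    M = rng.normal(size=(pk, pk))
    deltas.append(M @ M.T + pk * np.eye(pk))      # positive definite Delta_k
X = rtensornorm(1000, np.zeros(p), deltas, rng)
print(X.shape)                                    # (1000, 2, 3, 4)
\end{verbatim}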
\paragraph{Moments of the Multivariate Normal (in matrix notation):}
The moment generating function of the multivariate normal distribution with parameters $\mu, \Sigma$ is
\begin{displaymath}
M(t) = \E e^{\t{t}X} = \exp(\t{t}\mu + \tfrac{1}{2}\t{t}\Sigma t).
\end{displaymath}
The differentials up to order four are
\begin{align*}
    \d M(t) &= M(t) \t{(\mu + \Sigma t)} \d{t} \\
    \d^2 M(t) &= \t{\d{t}}\, M(t) \big[(\mu + \Sigma t)\t{(\mu + \Sigma t)} + \Sigma\big] \d{t} \\
    \d^3 M(t) &= M(t) (\mu + \Sigma t)\circ [(\mu + \Sigma t)\circ (\mu + \Sigma t) + 3\Sigma]\mlm{k = 1}{3} \d{t} \\
    \d^4 M(t) &= M(t) \big[(\mu + \Sigma t)\circ(\mu + \Sigma t)\circ[(\mu + \Sigma t)\circ(\mu + \Sigma t) + 6\Sigma] + 3\,\Sigma\circ\Sigma\big]\mlm{k = 1}{4} \d{t}
\end{align*}
Using the differential-to-derivative identification identity
\begin{displaymath}
\d^m f(t) = \ten{F}(t)\mlm{k = 1}{m}\d{t}
\qquad\Leftrightarrow\qquad
\D^m f(t) \equiv \frac{1}{m!}\sum_{\sigma\in\mathfrak{S}_m}\ten{F}(t)_{(\sigma)}
\end{displaymath}
in conjunction with simplifications gives the first four raw moments by evaluating at zero;
\begin{align*}
M_1 = \D M(t)|_{t = 0} &= \mu \\
M_2 = \D^2 M(t)|_{t = 0} &= \mu\t{\mu} + \Sigma \\
M_3 = \D^3 M(t)|_{t = 0} &= \mu\circ\mu\circ\mu + \mu\circ\Sigma + (\mu\circ\Sigma)_{((2), (1), (3))} + \Sigma\circ\mu \\
M_4 = \D^4 M(t)|_{t = 0} &\equiv \frac{1}{4!}\sum_{\sigma\in\mathfrak{S}_4} (\mu\circ\mu\circ\Sigma + \Sigma\circ\Sigma + \Sigma\circ\mu\circ\mu)_{(\sigma)}
\end{align*}
which leads to the centered moments (which are also the covariances of the sufficient statistic $t(X)$)
\begin{align*}
H_{1,1} &= \cov(t_1(X)\mid Y = y) \\
&= \Sigma \\
H_{1,2} &= \cov(t_1(X), t_2(X)\mid Y = y) \\
&= (\mu\circ\Sigma)_{(3, (1, 2))} + (\mu\circ\Sigma)_{(3, (2, 1))} \\
&= \t{\mu}\otimes\Sigma + \Sigma\otimes\t{\mu} \\
H_{2,1} &= \cov(t_2(X), t_1(X)\mid Y = y) = \t{H_{1, 2}} \\
H_{2,2} &= \cov(t_2(X)\mid Y = y) \\
&= (\mu\circ\mu\circ\Sigma + \Sigma\circ\Sigma + \Sigma\circ\mu\circ\mu)_{((1, 3), (2, 4))} +
(\mu\circ\mu\circ\Sigma + \Sigma\circ\Sigma + \Sigma\circ\mu\circ\mu)_{((1, 3), (4, 2))} \\
&\overset{???}{=} 2\Sigma\otimes\Sigma
+ \mu\otimes\Sigma\otimes\t{\mu}
+ \t{\mu}\otimes\Sigma\otimes\mu
+ \mu\t{\mu}\otimes\Sigma
+ \Sigma\otimes\mu\t{\mu}
\end{align*}
\end{document}