From 79f16c58dc21c1a0347ee6e61f3bb1b5b229a3b5 Mon Sep 17 00:00:00 2001 From: daniel Date: Tue, 25 Oct 2022 17:00:24 +0200 Subject: [PATCH] GMLM.tex: Fisher Information, add: ttt, fix: ttm - allow vector as p x 1 matrix, add: num_deriv2 --- LaTeX/GMLM.tex | 1151 +++++++++++++++++++++++++------- tensorPredictors/NAMESPACE | 2 + tensorPredictors/R/GMLM.R | 50 +- tensorPredictors/R/num_deriv.R | 15 + tensorPredictors/R/ttm.R | 1 + tensorPredictors/R/ttt.R | 14 + 6 files changed, 947 insertions(+), 286 deletions(-) create mode 100644 tensorPredictors/R/ttt.R diff --git a/LaTeX/GMLM.tex b/LaTeX/GMLM.tex index ae30daf..73f3d6f 100644 --- a/LaTeX/GMLM.tex +++ b/LaTeX/GMLM.tex @@ -1,36 +1,45 @@ -\documentclass[a4paper, 12pt]{article} +\documentclass[a4paper, 10pt]{article} \usepackage[utf8]{inputenc} \usepackage[T1]{fontenc} \usepackage{fullpage} -\usepackage{amsmath, amssymb, amstext, amsthm} -\usepackage{bm} % \boldsymbol and italic corrections, ... -\usepackage[pdftex]{hyperref} +\usepackage{amsmath, amssymb, amstext, amsthm, scalerel, bm} \usepackage{makeidx} % Index (Symbols, Names, ...) \usepackage{xcolor, graphicx} % colors and including images \usepackage{tikz} -\usetikzlibrary{calc} +\usepackage{algorithm, algpseudocode} % Pseudo Codes / Algorithms +\usepackage[pdftex]{hyperref} % Load as last package! Redefines commands +\usetikzlibrary{calc, 3d} + +% defines a new TiKz option `canvas is plane={O(#1,#2,#3)x(#4,#5,#6)y(#7,#8,#9)}` +\makeatletter +\tikzoption{canvas is plane}[]{\@setOxy#1} +\def\@setOxy O(#1,#2,#3)x(#4,#5,#6)y(#7,#8,#9){% + \def\tikz@plane@origin{\pgfpointxyz{#1}{#2}{#3}}% + \def\tikz@plane@x{\pgfpointxyz{#4}{#5}{#6}}% + \def\tikz@plane@y{\pgfpointxyz{#7}{#8}{#9}}% + \tikz@canvas@is@plane% +} +\makeatother + \usepackage[ % backend=bibtex, style=authoryear-comp ]{biblatex} -\usepackage{algorithm, algpseudocode} % Pseudo Codes / Algorithms % Document meta into -\title{Generalized Multi-Linear Model for the Quadratic Exponential Family} +\title{Generalized Multi-Linear Modeling for Sufficient Dimension Reduction for the Quadratic Exponential Family} \author{Daniel Kapla} \date{\today} % Set PDF title, author and creator. 
\AtBeginDocument{ \hypersetup{ - pdftitle = {Generalized Multi-Linear Model for the Quadratic Exponential Family}, + pdftitle = {Generalized Multi-Linear Modeling for Sufficient Dimension Reduction for the Quadratic Exponential Family}, pdfauthor = {Daniel Kapla}, pdfcreator = {\pdftexbanner} } } -\makeindex - % Bibliography resource(s) \addbibresource{main.bib} @@ -48,14 +57,17 @@ \newtheorem{remark}{Remark} % Define math macros -\newcommand{\mat}[1]{\boldsymbol{#1}} -\newcommand{\ten}[1]{\mathcal{#1}} -\renewcommand{\vec}{\operatorname{vec}} -\newcommand{\unvec}{\operatorname{vec^{-1}}} -\newcommand{\reshape}[1]{\operatorname{reshape}_{#1}} -\newcommand{\vech}{\operatorname{vech}} -\newcommand{\rank}{\operatorname{rank}} -\newcommand{\diag}{\operatorname{diag}} +\newcommand*{\mat}[1]{\boldsymbol{#1}} +\newcommand*{\ten}[1]{\mathcal{#1}} +\renewcommand*{\vec}{\operatorname{vec}} +\newcommand*{\unvec}{\operatorname{vec^{-1}}} +\newcommand*{\reshape}[1]{\operatorname{reshape}_{#1}} +\newcommand*{\vech}{\operatorname{vech}} +\newcommand*{\rank}{\operatorname{rank}} +\newcommand*{\diag}{\operatorname{diag}} +\newcommand*{\perm}[1]{\textnormal{Perm}_{#1}} % set of permutations of size #1 +\newcommand*{\len}[1]{\#{#1}} % length of #1 +\DeclareMathOperator*{\ttt}{\circledast} \DeclareMathOperator{\tr}{tr} \DeclareMathOperator{\var}{Var} \DeclareMathOperator{\cov}{Cov} @@ -64,19 +76,65 @@ % \DeclareMathOperator{\independent}{{\bot\!\!\!\bot}} \DeclareMathOperator*{\argmin}{{arg\,min}} \DeclareMathOperator*{\argmax}{{arg\,max}} -\newcommand{\D}{\textnormal{D}} % derivative -\renewcommand{\d}{\textnormal{d}} % differential -\renewcommand{\t}[1]{{#1^{\prime}}} % matrix transpose -\newcommand{\pinv}[1]{{#1^{\dagger}}} % `Moore-Penrose pseudoinverse` -\newcommand{\invlink}{\widetilde{\mat{g}}} +\newcommand*{\D}{\textnormal{D}} % derivative +\renewcommand*{\H}{\textnormal{H}} % hessian +\renewcommand*{\d}{\textnormal{d}} % differential +\renewcommand*{\t}[1]{{#1^{\prime}}} % matrix transpose +\newcommand*{\pinv}[1]{{#1^{\dagger}}} % `Moore-Penrose pseudoinverse` +\newcommand*{\invlink}{\widetilde{\mat{g}}} \newcommand{\todo}[1]{{\color{red}TODO: #1}} \newcommand{\effie}[1]{{\color{blue}Effie: #1}} + +%%% Custom operators with ether one or two arguments (limits) +\makeatletter +%%% Multi-Linear Multiplication +% Save first argument as \arg@one +\def\mlm#1{\def\arg@one{#1}\futurelet\next\mlm@i} +% Check for second argument +\def\mlm@i{\ifx\next\bgroup\expandafter\mlm@two\else\expandafter\mlm@one\fi} +% specialization for one or two arguments, both versions use saved first argument +\def\mlm@one{\mathchoice% + {\operatorname*{\scalerel*{\times}{\bigotimes}}_{\makebox[0pt][c]{$\scriptstyle \arg@one$}}}% + {\operatorname*{\scalerel*{\times}{\bigotimes}}_{\arg@one}}% + {\operatorname*{\scalerel*{\times}{\bigotimes}}_{\arg@one}}% + {\operatorname*{\scalerel*{\times}{\bigotimes}}_{\arg@one}}% +} +% this commands single argument is the second argument of \mlm +\def\mlm@two#1{\mathchoice% + {\operatorname*{\scalerel*{\times}{\bigotimes}}_{\makebox[0pt][c]{$\scriptstyle \arg@one$}}^{\makebox[0pt][c]{$\scriptstyle #1$}}}% + {\operatorname*{\scalerel*{\times}{\bigotimes}}_{\arg@one}^{#1}}% + {\operatorname*{\scalerel*{\times}{\bigotimes}}_{\arg@one}^{#1}}% + {\operatorname*{\scalerel*{\times}{\bigotimes}}_{\arg@one}^{#1}}% +} + +%%% Big Kronecker Product (with overflowing limits) +% Save first argument as \arg@one +\def\bigkron#1{\def\arg@one{#1}\futurelet\next\bigkron@i} +% Check for second argument 
+\def\bigkron@i{\ifx\next\bgroup\expandafter\bigkron@two\else\expandafter\bigkron@one\fi} +% specialization for one or two arguments, both versions use saved first argument +\def\bigkron@one{\mathchoice% + {\bigotimes_{\makebox[0pt][c]{$\scriptstyle \arg@one$}}}% + {\bigotimes_{\arg@one}}% + {\bigotimes_{\arg@one}}% + {\bigotimes_{\arg@one}}% +} +% this command's single argument is the second argument of \bigkron +\def\bigkron@two#1{\mathchoice% + {\bigotimes_{\makebox[0pt][c]{$\scriptstyle \arg@one$}}^{\makebox[0pt][c]{$\scriptstyle #1$}}}% + {\bigotimes_{\arg@one}^{#1}}% + {\bigotimes_{\arg@one}^{#1}}% + {\bigotimes_{\arg@one}^{#1}}% +} +\makeatother + % Pseudo Code Commands \newcommand{\algorithmicbreak}{\textbf{break}} \newcommand{\Break}{\State \algorithmicbreak} + \begin{document} \maketitle @@ -84,8 +142,8 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{abstract} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - We propose a method for sufficient dimension reduction of Tensor-valued predictor (multi dimensional arrays) for regression or classification. We assume an Quadratic Exponential Family for a Generalized Linear Model in an inverse regression setting where the relation via a link is of a multi-linear nature. - Using a multi-linear relation allows to perform per-axis reductions which reduces the total number of parameters drastically for higher order Tensor-valued predictors. Under the Exponential Family we derive maximum likelihood estimates for the multi-linear sufficient dimension reduction of the Tensor-valued predictors. Furthermore, we provide an estimation algorithm which utilizes the Tensor structure allowing efficient implementations. The performance of the method is illustrated via simulations and real world examples are provided. + We consider regression and classification for a \textit{general} response and tensor-valued predictors (multi dimensional arrays) and propose a \textit{novel formulation} for sufficient dimension reduction. Assuming the distribution of the tensor-valued predictors given the response is in the quadratic exponential family, we model the natural parameter in an inverse regression setting as a multi-linear function of the response. + This multi-linear relation allows per-axis reductions which drastically reduce the total number of parameters for higher order tensor-valued predictors. We derive maximum likelihood estimates for the multi-linear sufficient dimension reduction and provide a computationally efficient estimation algorithm which leverages the tensor structure. The performance of the method is illustrated via simulations, and real world examples are provided. \end{abstract} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -94,15 +152,15 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Notation} -Vectors are write as boldface lowercase letters (e.g. $\mat a$, $\mat b$), matrices use boldface uppercase or Greek letters (e.g. $\mat A$, $\mat B$, $\mat\alpha$, $\mat\Delta$).
The identity matrix of dimensions $p\times p$ is denoted by $\mat{I}_p$ and the commutation matrix as $\mat{K}_{p, q}$ or $\mat{K}_p$ is case of $p = q$. Tensors, meaning multi-dimensional arrays of order at least 3, use uppercase calligraphic letters (e.g. $\ten{A}$, $\ten{B}$, $\ten{X}$, $\ten{Y}$, $\ten{F}$). Boldface indices (e.g. $\mat{i}, \mat{j}, \mat{k}$) denote multi-indices $\mat{i} = (i_1, ..., i_r)\in[\mat{d}]$ where the bracket notation is a shorthand for $[r] = \{1, ..., r\}$ which in conjunction with a multi-index as argument means $[\mat{d}] = [d_1]\times ... \times[d_K] = \{ (i_1, ..., i_r)\in\mathbb{N}^r : 1\leq i_k\leq d_k, \forall k = 1, ..., r \}$. +Vectors are written as boldface lowercase letters (e.g. $\mat a$, $\mat b$), matrices use boldface uppercase or Greek letters (e.g. $\mat A$, $\mat B$, $\mat\alpha$, $\mat\Delta$). The identity matrix of dimensions $p\times p$ is denoted by $\mat{I}_p$ and the commutation matrix as $\mat{K}_{p, q}$, or $\mat{K}_p$ in case of $p = q$. Tensors, meaning multi-dimensional arrays of order at least 3, use uppercase calligraphic letters (e.g. $\ten{A}$, $\ten{B}$, $\ten{X}$, $\ten{Y}$, $\ten{F}$). Boldface indices (e.g. $\mat{i}, \mat{j}, \mat{k}$) denote multi-indices $\mat{i} = (i_1, ..., i_r)\in[\mat{q}]$ where the bracket notation is a shorthand for $[r] = \{1, ..., r\}$ which in conjunction with a multi-index as argument means $[\mat{q}] = [q_1]\times ... \times[q_r] = \{ (i_1, ..., i_r)\in\mathbb{N}^r : 1\leq i_k\leq q_k, \forall k = 1, ..., r \}$. -Let $\ten{A} = (a_{i_1,...,i_r})\in\mathbb{R}^{d_1\times ...\times d_r}$ be an order\footnote{Also called rank, therefore the variable name $r$, but this term is \emph{not} used as it leads to confusion with the rank as in ``the rank of a matrix''.} $r$ tensor where $r\in\mathbb{N}$ is the number of modes or axis of $\ten{A}$. For matrices $\mat{B}_k\in\mathbb{R}^{p_k\times d_k}$ with $k\in[r] = \{1, 2, ..., r\}$ the \emph{multi-linear multiplication} is defined element wise as +Let $\ten{A} = (a_{i_1,...,i_r})\in\mathbb{R}^{q_1\times ...\times q_r}$ be an order\footnote{Also called rank, therefore the variable name $r$, but this term is \emph{not} used as it leads to confusion with the rank as in ``the rank of a matrix''.} $r$ tensor where $r\in\mathbb{N}$ is the number of modes or axes of $\ten{A}$. For matrices $\mat{B}_k\in\mathbb{R}^{p_k\times q_k}$ with $k\in[r] = \{1, 2, ..., r\}$ the \emph{multi-linear multiplication} is defined element wise as \begin{displaymath} - (\ten{A}\times\{\mat{B}_1, ..., \mat{B}_r\})_{j_1, ..., j_r} = \sum_{i_1, ..., i_r = 1}^{d_1, ..., d_r} a_{i_1, ..., i_r}(B_{1})_{j_1, i_1} \cdots (B_{r})_{j_r, i_r} + (\ten{A}\times\{\mat{B}_1, ..., \mat{B}_r\})_{j_1, ..., j_r} = \sum_{i_1, ..., i_r = 1}^{q_1, ..., q_r} a_{i_1, ..., i_r}(B_{1})_{j_1, i_1} \cdots (B_{r})_{j_r, i_r} \end{displaymath} which results in an order $r$ tensor of dimensions $p_1\times ...\times p_r$. With this the \emph{$k$-mode product} between the tensor $\ten{A}$ and the matrix $\mat{B}_k$ is given by \begin{displaymath} - \mat{A}\times_k\mat{B}_k = \ten{A}\times\{\mat{I}_{d_1}, ..., \mat{I}_{d_{k-1}}, \mat{B}_{k}, \mat{I}_{d_{k+1}}, ..., \mat{I}_{d_r}\}. + \ten{A}\times_k\mat{B}_k = \ten{A}\times\{\mat{I}_{q_1}, ..., \mat{I}_{q_{k-1}}, \mat{B}_{k}, \mat{I}_{q_{k+1}}, ..., \mat{I}_{q_r}\}. \end{displaymath} Furthermore, the notation $\ten{A}\times_{k\in S}$ is a shorthand for the iterative application of the mode product for all indices in $S\subseteq[r]$.
For example $\ten{A}\times_{k\in\{2, 5\}}\mat{B}_k = \ten{A}\times_2\mat{B}_2\times_5\mat{B}_5$. By only allowing $S$ to be a set, this notation is unambiguous, because the mode products commute for different modes, $j\neq k\Rightarrow\ten{A}\times_j\mat{B}_j\times_k\mat{B}_k = \ten{A}\times_k\mat{B}_k\times_j\mat{B}_j$. @@ -112,101 +170,53 @@ The \emph{inner product} between two tensors of the same order and dimensions is \end{displaymath} with which the \emph{Frobenius Norm} $\|\ten{A}\|_F = \sqrt{\langle\ten{A}, \ten{A}\rangle}$. Of interest is also the \emph{maximum norm} $\|\ten{A}\|_{\infty} = \max_{i_1, ..., i_r} a_{i_1, ..., i_r}$. Furthermore, the Frobenius and maximum norm are also used for matrices while for a vector $\mat{a}$ the \emph{2 norm} is $\|\mat{a}\|_2 = \sqrt{\langle\mat{a}, \mat{a}\rangle}$. -Matrices and tensor can be \emph{vectorized} by the \emph{vectorization} operator $\vec$. Sometimes only the order of elements in an object are of interest, therefore we use the notation $\ten{A}\equiv \ten{B}$ for objects $\ten{A}, \ten{B}$ of any shape if and only if $\vec{\ten{A}} = \vec{\ten{B}}$. For tensors of order at least $2$ the \emph{flattening} (or \emph{unfolding} or \emph{matricization}) is a reshaping of the tensor into a matrix along an particular mode. For a tensor $\ten{A}$ of order $r$ and dimensions $d_1, ..., d_r$ the $k$-mode unfolding $\ten{A}_{(k)}$ is a $d_k\times \prod_{l=1, l\neq k}d_l$ matrix. For the tensor $\ten{A} = (a_{i_1,...,i_r})\in\mathbb{R}^{d_1, ..., d_r}$ the elements of the $k$ unfolded tensor $\ten{A}_{(k)}$ are +Matrices and tensors can be \emph{vectorized} by the \emph{vectorization} operator $\vec$. Sometimes only the order of elements in an object is of interest; therefore we use the notation $\ten{A}\equiv \ten{B}$ for objects $\ten{A}, \ten{B}$ of any shape if and only if $\vec{\ten{A}} = \vec{\ten{B}}$. For tensors of order at least $2$ the \emph{flattening} (or \emph{unfolding} or \emph{matricization}) is a reshaping of the tensor into a matrix along a particular mode. For a tensor $\ten{A}$ of order $r$ and dimensions $q_1, ..., q_r$ the $k$-mode unfolding $\ten{A}_{(k)}$ is a $q_k\times \prod_{l=1, l\neq k}q_l$ matrix. For the tensor $\ten{A} = (a_{i_1,...,i_r})\in\mathbb{R}^{q_1, ..., q_r}$ the elements of the $k$ unfolded tensor $\ten{A}_{(k)}$ are \begin{displaymath} - (\ten{A}_{(k)})_{i_k, j} = a_{i_1, ..., i_r}\quad\text{ with }\quad j = 1 + \sum_{\substack{l = 1\\l \neq k}}^r (i_l - 1) \prod_{\substack{m = 1\\m\neq k}}^{l - 1}d_m. + (\ten{A}_{(k)})_{i_k, j} = a_{i_1, ..., i_r}\quad\text{ with }\quad j = 1 + \sum_{\substack{l = 1\\l \neq k}}^r (i_l - 1) \prod_{\substack{m = 1\\m\neq k}}^{l - 1}q_m. \end{displaymath} -The rank of a tensor $\ten{A}$ of dimensions $d_1\times ...\times d_r$ is given by a vector $\rank{\ten{A}} = (a_1, ..., a_r)\in[d_1]\times...\times[d_r]$ where $a_k = \rank(\ten{A}_{(k)})$ is the usual matrix rank of the $k$ unfolded tensor. +The rank of a tensor $\ten{A}$ of dimensions $q_1\times ...\times q_r$ is given by a vector $\rank{\ten{A}} = (a_1, ..., a_r)\in[q_1]\times...\times[q_r]$ where $a_k = \rank(\ten{A}_{(k)})$ is the usual matrix rank of the $k$ unfolded tensor.
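The $k$-mode unfolding and the $k$-mode product above are also the basic computational building blocks of the accompanying R package (this patch touches \texttt{ttm.R}). As a point of reference, a minimal base-R sketch of both operations, following exactly the index convention given above, could look as follows; it is an illustration, not the \texttt{tensorPredictors} implementation.
\begin{verbatim}
# Minimal base-R sketch of the k-mode unfolding A_(k) and the k-mode
# product A x_k B (illustrative only, not the package implementation).
unfold <- function(A, k) {
  # permute mode k to the front, then flatten column-major:
  # yields the q_k x prod(q_l, l != k) matrix A_(k)
  dims <- dim(A)
  matrix(aperm(A, c(k, seq_along(dims)[-k])), nrow = dims[k])
}
ttm <- function(A, B, k) {
  # multiply the k-mode unfolding from the left by B and fold back;
  # a plain vector is treated as a one-column matrix, in the spirit of
  # the ttm fix in this patch
  if (is.null(dim(B))) B <- cbind(B)
  dims <- dim(A)
  perm <- c(k, seq_along(dims)[-k])
  C <- B %*% unfold(A, k)
  dims[k] <- nrow(B)
  aperm(array(C, dims[perm]), order(perm))
}
# example: a 2 x 3 x 4 tensor times a 5 x 3 matrix in mode 2
A <- array(rnorm(24), dim = c(2, 3, 4))
dim(ttm(A, matrix(rnorm(15), 5, 3), 2))  # 2 5 4
\end{verbatim}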
-\subsection{Sufficient Dimension Reduction} -\todo{TODO!!!} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - \section{Quadratic Exponential Family GLM} +\subsection{The Model and Sufficient Reductions}\label{sec:GMLM} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -We propose a model based inverse regression method for estimation .... \todo{TODO!!!} - -\paragraph{Distribution} -\begin{equation}\label{eq:exp-family} - f_{\mat{\theta}_y}(\ten{X}\mid Y = y) = h(\ten{X})\exp(\t{\mat{\eta}(\mat{\theta}_y)}\mat{t}(\ten{X}) - b(\mat{\theta}_y)) -\end{equation} - -\paragraph{(inverse) link} -\begin{displaymath} - \invlink(\mat{\eta}(\mat{\theta}_y)) = \E_{\mat{\theta}_y}[\mat{t}(\ten{X})\mid Y = y] -\end{displaymath} -\paragraph{(multi) linear predictor} For -\begin{displaymath} - \mat{\eta}_y = \mat{\eta}(\mat{\theta}_y) = \begin{pmatrix} - \mat{\eta}_1(\mat{\theta}_y) \\ - \mat{\eta}_2(\mat{\theta}_y) - \end{pmatrix},\qquad - \mat{t}(\ten{X}) = \begin{pmatrix} - \mat{t}_1(\ten{X}) \\ - \mat{t}_2(\ten{X}) - \end{pmatrix} = \begin{pmatrix} - \vec{\ten{X}} \\ - \vec{\ten{X}}\otimes\vec{\ten{X}} - \end{pmatrix} -\end{displaymath} -where +The main relation underlying sufficient dimension reduction in an inverse regression setting is the following. Let $(\ten{X}, Y)$ be jointly distributed and let $\mat{R}$ be a measurable function of $\ten{X}$, that is, a statistic. Then \begin{align*} - \mat{\eta}_1(\mat{\theta}_y) &= \mat{\eta}_{y,1} = c_1 \vec(\overline{\ten{\eta}}_1 + \ten{F}_y\times_{k\in[r]}\mat{\alpha}_k) \\ - \mat{\eta}_2(\mat{\theta}_y) &= \mat{\eta}_{y,2} = c_2 \vec{\bigotimes_{k = r}^1 \mat{\Omega}_k} + Y\mid\ten{X} &\sim Y\mid \mat{R}(\ten{X}) + & &\Leftrightarrow & + \ten{X}\mid(Y, \mat{R}(\ten{X})) &\sim \ten{X}\mid\mat{R}(\ten{X}) \end{align*} -with model parameters $\overline{\ten{\eta}}_1, \mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{\Omega}_1, ..., \mat{\Omega}_r$ where $\overline{\ten{\eta}}_1$ is a $p_1\times ... \times p_r$ tensor, $\mat{\alpha}_j$ are $p_j\times q_j$ unconstrained matrices and $\mat{\Omega}_j$ are symmetric $p_j\times p_j$ matrices for each of the $j = 1, ..., r$ modes. Finally, $c_1$ and $c_2$ are known constants simplifying modeling for specific distributions. +This means that a statistic $\mat{R}(\ten{X})$ which is sufficient for the inverse regression $\ten{X}\mid Y$ yields a sufficient reduction of $\ten{X}$ for the forward regression $Y\mid\ten{X}$. +Throughout, $\ten{X}$ is a random order $r$ tensor of dimensions $p_1\times ...\times p_r$ while the response $Y$ is arbitrary, since it enters the model only through a known tensor valued function $\ten{F}_Y$ of $Y$ which has order $r$ and dimensions $q_1\times ... \times q_r$ and satisfies $\E\ten{F}_Y = \mat{0}$. Furthermore, we denote $p = \prod_{k = 1}^r p_k$ and $q = \prod_{k = 1}^r q_k$ as well as $\mat{p} = (p_1, ..., p_r)$ and $\mat{q} = (q_1, ..., q_r)$. -\begin{theorem}[Log-Likelihood and Score] - For $n$ i.i.d. observations $(\ten{X}_i, y_i), i = 1, ..., n$ the log-likelihood has the form - \begin{displaymath} - l(\mat{\eta}_y) = \sum_{i = 1}^n(\log h(\ten{X}_i) + c_1\langle\overline{\ten{\eta}}_1 + \ten{F}_{y_i}\times_{k\in[r]}\mat{\alpha}_k, \ten{X}_i \rangle + c_2\langle\ten{X}_i\times_{k\in[r]}\mat{\Omega}_k, \ten{X}_i \rangle - b(\mat{\eta}_{y_i})). - \end{displaymath} - The gradients with respect to the GLM parameters $\overline{\ten{\eta}}_1$, $\mat{\alpha}_j$ and $\mat{\Omega}_j$ for $j = 1, ..., r$ are given by - \begin{align*} - % \nabla_{\overline{\ten{\eta}}_1}l &= c_1\sum_{i = 1}^n \reshape{\mat{p}}(\mat{t}_1(\ten{X}_i) - \invlink_1(\mat{\eta}_{y_i})), \\ - \nabla_{\overline{\ten{\eta}}_1}l &\equiv c_1\sum_{i = 1}^n (\mat{t}_1(\ten{X}_i) - \invlink_1(\mat{\eta}_{y_i})), \\ - \nabla_{\mat{\alpha}_j}l &= c_1 \sum_{i = 1}^n \reshape{\mat{p}}(\mat{t}_1(\ten{X}_i) - \invlink_1(\mat{\eta}_{y_i}))_{(j)}\t{(\ten{F}_{y_i}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}, \\ - \nabla_{\mat{\Omega}_j}l &\equiv c_2 \mat{D}_{p_j}\t{\mat{D}_{p_j}} \reshape{(\mat{p}, \mat{p})}\!\!\Big(\sum_{i = 1}^n(\mat{t}_2(\ten{X}_i) - \invlink_2(\mat{\eta}_{y_i}))\Big)_{(j, r + j)}\vec\bigotimes_{\substack{k = r\\k\neq j}}^{1}\mat{\Omega}_k - \end{align*} -\end{theorem}  +We assume that $\ten{X}\mid Y$ follows a quadratic exponential family distribution. The restriction to a \emph{quadratic} exponential family is to properly model the tensor structure of $\ten{X}$. The pdf (pmf) with parameters $\mat{\theta}_y$ and natural parameters $\mat{\eta}_y = \mat{\eta}(\mat{\theta}_y)$ is given by +\begin{align} + f_{\mat{\theta}_y}(\ten{X}\mid Y = y) + &= h(\ten{X})\exp(\t{\mat{\eta}(\mat{\theta}_y)}\mat{t}(\ten{X}) - b(\mat{\theta}_y)) \nonumber \\ + &= h(\ten{X})\exp(\t{\mat{\eta}_{y,1}}\vec(\ten{X}) + \t{\mat{\eta}_{y,2}}(\vec(\ten{X})\otimes\vec(\ten{X})) - b(\mat{\theta}_y)) \nonumber \\ + &= h(\ten{X})\exp(\t{\mat{\eta}_{y,1}}\mat{t}_1(\ten{X}) + \t{\mat{\eta}_{y,2}}\mat{t}_2(\ten{X}) - b(\mat{\theta}_y)).\label{eq:exp-family} +\end{align} +Under \eqref{eq:exp-family} the first natural parameter vector $\mat{\eta}_{y,1}$ is a $p$ dimensional vector while the second natural parameter vector $\mat{\eta}_{y,2}$ consists of $p^2$ elements. This is not the most compact form, as it is possible to use only $p(p + 1) / 2$ values by taking $\vech(\vec(\ten{X})\t{\vec(\ten{X})})$ as the statistic for the second natural parameter. The reason for choosing $\mat{t}_2(\ten{X}) = \vec(\ten{X})\otimes\vec(\ten{X})$ instead is technical: it simplifies modeling the tensor structure of $\ten{X}$. +In analogy to a GLM we model the natural parameters as a multi-linear function of $\ten{F}_y$, which is a tensor of dimensions $q_1\times ... \times q_r$ of known functions of $y$ such that $\E_Y\ten{F}_Y = 0$. +\begin{equation}\label{eq:eta_y1} + \mat{\eta}_1(\mat{\theta}_y) = \mat{\eta}_{y,1} \equiv c_1 (\overline{\ten{\eta}}_1 + \ten{F}_y\times_{k\in[r]}\mat{\alpha}_k) +\end{equation} +where $\overline{\ten{\eta}}_1$ is an intercept parameter of the same shape as $\ten{X}$ while the linear predictors $\mat{\alpha}_k$, for $k = 1, ..., r$, are unconstrained $p_k\times q_k$ dimensional matrices. The scalar $c_1$ is known and eases modeling for specific distributions. The second natural parameter vector $\mat{\eta}_{y,2}$ is assumed to have the form +\begin{equation}\label{eq:eta_y2} + \mat{\eta}_2(\mat{\theta}_y) = \mat{\eta}_{y,2} \equiv c_2 \bigotimes_{k = r}^1 \mat{\Omega}_k +\end{equation} +with $\mat{\Omega}_k$ being $p_k\times p_k$ symmetric matrices for $k = 1, ..., r$.
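To make the parameterization \eqref{eq:eta_y1} and \eqref{eq:eta_y2} concrete, consider the reduction in free parameters it buys: for $\mat{p} = (2, 3, 5)$ and $\mat{q} = (1, 2, 3)$ the GMLM uses $30 + (2 + 6 + 15) + (3 + 6 + 15) = 77$ values ($\overline{\ten{\eta}}_1$, the $\mat{\alpha}_k$ and the symmetric $\mat{\Omega}_k$), whereas an unstructured model of the same natural parameters would need $30 + 30\cdot 6 + 30\cdot 31/2 = 675$. The short R sketch below (function names are illustrative, not the package API; \texttt{ttm} as sketched in the notation section) assembles both natural parameter components.
\begin{verbatim}
# Sketch: assemble the GMLM natural parameters for a given y.
eta_y1 <- function(eta1_bar, F_y, alphas, c1 = 1) {
  # c1 * (eta1_bar + F_y x_1 alpha_1 ... x_r alpha_r), cf. (eq:eta_y1)
  for (k in seq_along(alphas)) F_y <- ttm(F_y, alphas[[k]], k)
  c1 * (eta1_bar + F_y)
}
eta_y2 <- function(Omegas, c2 = 1) {
  # c2 * (Omega_r kron ... kron Omega_1), cf. (eq:eta_y2); free of y
  c2 * Reduce(`%x%`, rev(Omegas))
}
\end{verbatim}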
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - \section{Sufficient Dimension Reduction} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -By Theorem~1 in \cite{sdr-BuraDuarteForzani2016} we have that +In a classical GLM there is an invertible link function $\mat{g}$ connecting the natural parameters to the expectation of the exponential family statistic as $\mat{\eta}_y = \mat{g}(\E[\mat{t}(\ten{X}) \mid Y = y])$. Such a link may not exist in our setting, but for our purposes it suffices to have the ``inverse'' link. The ``inverse'' link $\invlink$ always exists because the natural parameters fully describe the distribution \begin{displaymath} - \mat{R}(\ten{X}) = \t{\mat{B}}(\mat{t}(\ten{X}) - \E\mat{t}(\ten{X})) + \invlink(\mat{\eta}_y) = \E[\mat{t}(\ten{X}) \mid Y = y] \end{displaymath} -is a sufficient reduction where $\mat{B}\in\mathbb{R}^{p(p + 1)\times q}$ with $\Span(\mat{B}) = \Span(\{\mat{\eta}_Y - \E_{Y}\mat{\eta}_Y : Y\in\mathcal{S}_Y\})$. With $\E_Y\mat{\eta}_{Y,1} \equiv c_1\E[\overline{\ten{\eta}}_1 - \ten{F}_Y\times_{k\in[r]}\mat{\alpha}_k] = c_1 \overline{\ten{\eta}}_1$ cause $\E_Y\ten{F}_Y = 0$ and $\mat{\eta}_{y,2}$ does not depend on $y$ (regardless of the notation) we get -\begin{displaymath} - \mat{\eta}_Y - \E_{Y}\mat{\eta}_Y = \begin{pmatrix} - \mat{\eta}_{Y,1} - \E_{Y}\mat{\eta}_{Y,1} \\ - \mat{\eta}_{Y,2} - \E_{Y}\mat{\eta}_{Y,2} - \end{pmatrix} = \begin{pmatrix} - c_1\vec(\ten{F}_Y\times_{k\in[r]}\mat{\alpha}_k) \\ - \mat{0} - \end{pmatrix}. -\end{displaymath} -Noting that -\begin{displaymath} - c_1\vec(\ten{F}_Y\times_{k\in[r]}\mat{\alpha}_k) - = c_1\Big( \bigotimes_{k = r}^{1}\mat{\alpha}_k \Big)\vec(\ten{F}_Y) -\end{displaymath} -we get -\begin{displaymath} - \mat{B} = \begin{pmatrix} - c_1 \bigotimes_{k = r}^{1}\mat{\alpha}_k \\ - \mat{0}_{p^2\times q} - \end{pmatrix}. -\end{displaymath} -Simplifying leads to -\begin{displaymath} - \t{\mat{B}}(\mat{t}(\ten{X}) - \E\mat{t}(\ten{X})) = c_1 \Big( \bigotimes_{k = r}^{1}\mat{\alpha}_k \Big)(\mat{t}_1(\ten{X}) - \E\mat{t}_1(\ten{X})). -\end{displaymath} -Now note $\Span(\mat{A}) = \Span(c \mat{A})$ for any matrix $\mat{A}$ and non-zero scalar $c$ as well as the definition $\mat{t}_1(\ten{X}) = \vec{\ten{X}}$ which proves the following. +which we also split into two parts as $\invlink_1(\mat{\eta}_y) = \E[\mat{t}_1(\ten{X}) \mid Y = y] \equiv \E[\ten{X} \mid Y = y]$ and $\invlink_2(\mat{\eta}_y) = \E[\mat{t}_2(\ten{X}) \mid Y = y] = \E[\vec(\ten{X})\otimes\vec(\ten{X}) \mid Y = y]$. + +We denote this GLM-like model as the \emph{generalized multi-linear model} (GMLM). \begin{theorem}[SDR]\label{thm:sdr} A sufficient reduction for the regression $Y\mid \ten{X}$ under the quadratic exponential family inverse regression model \eqref{eq:exp-family} is given by @@ -219,7 +229,69 @@ Now note $\Span(\mat{A}) = \Span(c \mat{A})$ for any matrix $\mat{A}$ and non-ze \end{theorem} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - \section{Examples} + \section{Estimation} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +We estimate the GMLM parameters of Section~\ref{sec:GMLM} via maximum likelihood. + +\begin{theorem}[Log-Likelihood and Score]\label{thm:grad} + For $n$ i.i.d.
observations $(\ten{X}_i, y_i), i = 1, ..., n$ the log-likelihood has the form + \begin{equation}\label{eq:likelihood} + l(\mat{\Theta}_y) = \sum_{i = 1}^n(\log h(\ten{X}_i) + c_1\langle\overline{\ten{\eta}}_1 + \ten{F}_{y_i}\times_{k\in[r]}\mat{\alpha}_k, \ten{X}_i \rangle + c_2\langle\ten{X}_i\times_{k\in[r]}\mat{\Omega}_k, \ten{X}_i \rangle - b(\mat{\eta}_{y_i})). + \end{equation} + with $\mat{\Theta}_y$ being the collection of all GMLM parameters $\overline{\mat{\eta}}$ and $\mat{\alpha}_k, \mat{\Omega}_k$ for $k = 1, ..., r$. + The partial gradients with respect to the GMLM parameters are given by + \begin{align*} + % \nabla_{\overline{\ten{\eta}}_1}l &= c_1\sum_{i = 1}^n \reshape{\mat{p}}(\mat{t}_1(\ten{X}_i) - \invlink_1(\mat{\eta}_{y_i})), \\ + \nabla_{\overline{\ten{\eta}}_1}l &\equiv c_1\sum_{i = 1}^n (\mat{t}_1(\ten{X}_i) - \invlink_1(\mat{\eta}_{y_i})), \\ + \nabla_{\mat{\alpha}_j}l &= c_1 \sum_{i = 1}^n \reshape{\mat{p}}(\mat{t}_1(\ten{X}_i) - \invlink_1(\mat{\eta}_{y_i}))_{(j)}\t{(\ten{F}_{y_i}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}, \\ + \nabla_{\mat{\Omega}_j}l &\equiv c_2 \mat{D}_{p_j}\t{\mat{D}_{p_j}} \reshape{(\mat{p}, \mat{p})}\!\!\Big(\sum_{i = 1}^n(\mat{t}_2(\ten{X}_i) - \invlink_2(\mat{\eta}_{y_i}))\Big)_{(j, r + j)}\vec\bigotimes_{\substack{k = r\\k\neq j}}^{1}\mat{\Omega}_k + \end{align*} +\end{theorem} + +A straight forward idea for parameter estimation is to use Gradient Descent. For pure algorithmic speedup, by only changing the update rule but \emph{not} the gradient denoted $\nabla_{\mat{\Theta}}l$ of the objective function, we use Nesterov Accelerated Gradient Descent with an internal Line Search to determine the step size in each iteration, Algorithm~\ref{alg:NAGD}. + +% Denote with $l(\mat{\Theta}\mid \ten{X}, \ten{F}_y)$ the negative log-likelihood \eqref{eq:likelihood} which is minimization objective from the view of the Algorithm~\ref{alg:NAGD}. + +\begin{algorithm}[ht] + \caption{\label{alg:NAGD}Nesterov Accelerated Gradient Descent} + \begin{algorithmic}[1] + \State Objective: $l(\Theta \mid \ten{X}, \ten{F}_y)$ + \State Arguments: Order $r + 1$ tensors $\ten{X}$, $\ten{F}$ + \State Initialize: Parameters $\Theta^{(0)}$, $0 < c, \delta^{(1)}$ and $0 < \gamma < 1$ + \\ + \State $t \leftarrow 1$ + \Comment{step counter} + \State $\mat{\Theta}^{(1)} \leftarrow \mat{\Theta}^{(0)}$ + \Comment{artificial first step} + \State $(m^{(0)}, m^{(1)}) \leftarrow (0, 1)$ + \Comment{momentum extrapolation weights} + \\ + \Repeat \Comment{repeat untill convergence} + \State $\mat{M} \leftarrow \mat{\Theta}^{(t)} + \frac{m^{(t - 1)} - 1}{m^{(t)}}(\mat{\Theta}^{(t)} - \mat{\Theta}^{(t - 1)})$ \Comment{momentum extrapolation} + \For{$\delta = \gamma^{-1}\delta^{(t)}, \delta^{(t)}, \gamma\delta^{(t)}, \gamma^2\delta^{(t)}, ...$} \Comment{Line Search} + \State $\mat{\Theta}_{\text{temp}} \leftarrow \mat{M} + \delta \nabla_{\mat{\Theta}} l(\mat{M})$ + \If{$l(\mat{\Theta}_{\text{temp}}) \leq l(\mat{\Theta}^{(t - 1)}) - c \delta \|\nabla_{\mat{\Theta}} l(\mat{M})\|_F^2$} \Comment{Armijo Condition} + \State $\mat{\Theta}^{(t + 1)} \leftarrow \mat{\Theta}_{\text{temp}}$ + \State $\delta^{(t + 1)} \leftarrow \delta$ + \Break + \EndIf + \EndFor + \State $m^{(t + 1)} \leftarrow \frac{1 + \sqrt{1 + (2 m^{(t)})^2}}{2}$ \Comment{update extrapolation weights} + \State $t \leftarrow t + 1$ + \Until converged + \end{algorithmic} +\end{algorithm} + +Algorithm~\ref{alg:NAGD} requires initial parameter estimates. 
Therefore, we choose a simple heuristic approach by starting with $\mat{\Omega}_k^{(0)} = \mat{I}_{p_k}$ and $\overline{\ten{\eta}}_1^{(0)}$ dependent on the particular distribution. For example for a Tensor Normal distribution the sample mean is a good choice while for the Ising model $0$ has nice properties. The interesting part are the multi-linear predictors $\mat{\alpha}_k$. Therefore, let $\mat{\Sigma}_k = \frac{q_k}{n q}\sum_{i = 1}^n {\ten{F}_{y_i}}_{(k)}\t{{\ten{F}_{y_i}}_{(k)}}$ and $\mat{\Delta}_k = \frac{p_k}{n p}\sum_{i = 1}^n {\ten{X}_{i}}_{(k)}\t{{\ten{X}_{i}}_{(k)}}$ mode wise sample covariance estimates. For both we take the $s_k = \min(p_k, q_k)$ order SVD approximation as $\mat{\Delta}_k \approx \mat{U}_k\mat{D}_k\t{\mat{U}_k}$ and $\mat{\Sigma}_k \approx \mat{V}_k\mat{S}_k\t{\mat{V}_k}$. The approximate SVD gives us the $s_k$ first singular vectors with dimensions $\mat{U}_k(p_k\times s_k), \mat{D}_k(s_k\times s_k)$ and $\mat{V}_k(q_k\times s_k), \mat{S}_k(s_k\times s_k)$. Then we initialize +%%%%%%%%%%%%%%%%%%%%%%%%% TODO: !!!! \todo{!!!!!} +\begin{displaymath} + \mat{\alpha}_k^{(0)} = \mat{U}_k\sqrt{\mat{D}_k}\sqrt{\mat{S}_k}\t{\mat{V}_k}. +\end{displaymath} + +% An alternative is to use Iterative Re-weighted least Squares, but his requires the Fisher Information which has dimensions $s\times s$ for $s = p + \sum_{k = 1}^r (p_k q_k + p_k^2)$. This means that for the computation of the Fisher Information we have a computational complexity of $\mathcal{O}()$ + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + \section{Examples} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% We illustrate the SDR method on two special cases, first the Tensor Normal distribution and second on the Multi-Variate Bernoulli distribution with vector, matrix and tensor valued predictors. @@ -283,64 +355,6 @@ consisting of the first and second (uncentered) vectorized moments of the tensor &= \bigotimes_{k = r}^{1}\mat{\Omega}_k^{-1} + \vec(\ten{\mu}_y)\t{\vec(\ten{\mu}_y)} \end{align*} -For estimation purposes it's also of interest to express the log-partition function $b$ in terms of the natural parameters or the GLM parameters which has the form -\begin{displaymath} - b(\mat{\eta}_y) = \frac{1}{2}\t{\mat{\eta}_{y, 1}}(\unvec(-2\mat{\eta}_{y, 2}))^{-1} - \frac{1}{2}\log|\unvec(-2\mat{\eta}_{y, 2})|. 
-\end{displaymath} - - -Denote the Residuals as -\begin{displaymath} - \ten{R}_i = \ten{X}_i - \ten{\mu}_{y_i} -\end{displaymath} -then with $\overline{\ten{R}} = \frac{1}{n}\sum_{i = 1}^n \ten{R}_i$ we get -\begin{align*} - \nabla_{\overline{\eta}_1} l &= \overline{\ten{R}}, \\ - \nabla_{\mat{\alpha}_j} l &= \frac{1}{n}\ten{R}_{(j)}\t{(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}, \\ - \D l(\mat{\Omega}_j) &= \frac{1}{2}\t{\vec\Big(\frac{p}{p_j}\mat{\Omega}_j^{-1} - (\ten{X} + \mu_y)_{(j)}\t{(\ten{R}\times_{k\in[r]\backslash j}\mat{\Omega}_k)_{(j)}}\Big)}\mat{D}_{p_j}\t{\mat{D}_{p_j}} -\end{align*} - -% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% \subsubsection{Initial Values} -% First we set the gradient with respect to $\overline{\ten{\eta}}_1$ to zero -% \begin{gather*} -% 0 \overset{!}{=} \nabla_{\overline{\ten{\eta}}_1}l = c_1\sum_{i = 1}^n (\ten{X}_i - \ten{\mu}_i) \\ -% \overline{\ten{X}} = (\overline{\ten{\eta}}_1 + \overline{\ten{F}_y}\times_{k\in[r]}\mat{\alpha}_{k})\times_{l\in[r]}\mat{\Omega}_{l}^{-1} \\ -% \overline{\ten{X}}\times_{l\in[r]}\mat{\Omega}_{l} = \overline{\ten{\eta}}_1 + \overline{\ten{F}_y}\times_{k\in[r]}\mat{\alpha}_{k} \approx \overline{\ten{\eta}}_1 \\ -% \overline{\ten{\eta}}_1^{(0)} = \overline{\ten{X}}\times_{k\in[r]}\mat{\Omega}_{k}^{(0)} -% \end{gather*} -% where the approximation is due to the assumption that $\E \ten{F}_y = 0$. For the initial values of the scatter matrices $\mat{\Omega}_{l}$ we simply ignore the relation to the response and simply estimate them as the marginal scatter matrices. These initial estimates overemphasize the variability in the reduction subspace. Therefore, we first compute the unscaled mode covariance estimates -% \begin{displaymath} -% \widetilde{\mat{\Delta}}_j^{(0)} = \frac{p_j}{n p} (\ten{X} - \overline{\ten{X}})_{(j)}\t{(\ten{X} - \overline{\ten{X}})_{(j)}}. -% \end{displaymath} -% The next step is to scale them such that there Kronecker product has an appropriate trace -% \begin{displaymath} -% \mat{\Delta}_j^{(0)} = \left(\frac{\|\ten{X} - \overline{\ten{X}}\|_F^2}{n \prod_{k = 1}^r \tr(\widetilde{\mat{\Delta}}_j^{(0)})}\right)^{1 / r} \widetilde{\mat{\Delta}}_j^{(0)}. -% \end{displaymath} -% Finally, the co-variances need to be inverted to give initial estimated of the scatter matrices -% \begin{displaymath} -% \mat{\Omega}_j^{(0)} = (\mat{\Delta}_j^{(0)})^{-1}. -% \end{displaymath} -% The relay interesting part is to get initial estimates for the $\mat{\alpha}_j$ matrices. 
Setting the $\mat{\alpha}_j$ gradient to zero gives and substituting the initial estimates for $\overline{\ten{\eta}}_1$ and the $\mat{\Omega}_k$'s gives -% \begin{gather*} -% 0 \overset{!}{=} \nabla_{\mat{\alpha}_j}l = c_1 \sum_{i = 1}^n \reshape{\mat{p}}(\mat{t}_1(\ten{X}_i) - \mat{g}_1(\mat{\eta}_{y_i}))_{(j)}\t{(\ten{F}_{y_i}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}} \\ -% (\ten{X} - \overline{\ten{X}})_{(j)}\t{(\ten{F}_y \times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}} -% = \mat{\Omega}_j^{(0)}\mat{\alpha}_j(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\Omega}_k^{(0)}\mat{\alpha}_k)_{(j)}\t{(\ten{F}_y \times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}} -% \end{gather*} -% Now letting $\mat{\Sigma}_k$ be the mode co-variances of $\ten{F}_y$ and define $\ten{W}_y = \ten{F}_y\times_{k\in[r]}\mat{\Sigma}_k$ we get -% \begin{gather*} -% (\ten{X} - \overline{\ten{X}})_{(j)}\t{(\ten{F}_y \times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}} -% = \mat{\Omega}_j^{(0)}\mat{\alpha}_j\mat{\Sigma}_j^{1/2}(\ten{W}_y\times_{k\in[r]\backslash j}\mat{\Omega}_k^{(0)}\mat{\alpha}_k\mat{\Sigma}_k^{1/2})_{(j)}\t{(\ten{W}_y \times_{k\in[r]\backslash j}\mat{\alpha}_k \mat{\Sigma}_{k}^{1/2})_{(j)}}\mat{\Sigma}_{j}^{1/2} \\ -% = \mat{\Omega}_j^{(0)}\mat{\alpha}_j\mat{\Sigma}_j^{1/2}(\ten{W}_y)_{(j)}\Big(\mat{I}_n\otimes\bigotimes_{\substack{k = r\\k\neq j}}^{1}\mat{\Sigma}_k^{1/2}\t{\mat{\alpha}_k}\mat{\Omega}_k^{(0)}\mat{\alpha}_k\mat{\Sigma}_{k}^{1/2}\Big)\t{(\ten{W}_y)_{(j)}}\mat{\Sigma}_{j}^{1/2}. -% \end{gather*} -% Now we let $\mat{\alpha}_j^{(0)}$ be such that $\mat{\Sigma}_k^{1/2}\t{(\mat{\alpha}^{(0)}_k)}\mat{\Omega}_k^{(0)}\mat{\alpha}^{(0)}_k\mat{\Sigma}_{k}^{1/2} = \mat{I}_{p_j}$, which leads by substitution to -% \begin{displaymath} -% (\ten{X} - \overline{\ten{X}})_{(j)}\t{(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\alpha}^{(0)}_k)_{(j)}} -% = \mat{\Omega}_j^{(0)}\mat{\alpha}^{(0)}_j\mat{\Sigma}_j^{1/2}(\ten{W}_y)_{(j)}\t{(\ten{W}_y)_{(j)}}\mat{\Sigma}_{j}^{1/2} -% = \frac{p_j}{n p}\mat{\Omega}_j^{(0)}\mat{\alpha}^{(0)}_j\mat{\Sigma}_j -% \end{displaymath} -% \todo{Does this make sense?!?!?!} - %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Ising Model} The conditional Ising model for the inverse regression $\ten{X}\mid Y = y$ with $p (p + 1) / 2$ parameters $\mat{\theta}_y$ is given by @@ -350,7 +364,7 @@ The conditional Ising model for the inverse regression $\ten{X}\mid Y = y$ with &= h(\ten{X})\exp(\t{\mat{{\eta}}(\mat{\theta}_y)}\mat{t}(\ten{X}) - b(\mat{\theta}_y)) \end{align*} where $h(\ten{X}) = 1$ and $b(\mat{\theta}_y) = -\log p_0(\mat{\theta}(\mat{\eta}_y))$. -Following the GMLM model we have model the natrual parameters as +Following the GMLM model we have model the natural parameters as \begin{align*} \mat{\eta}_{y,1} &\equiv c_1 (\overline{\ten{\eta}}_1 + \ten{F}_y\times_{k\in[r]}\mat{\alpha}_k), & \mat{\eta}_{y,2} &\equiv c_2 \bigotimes_{k = r}^{1}\mat{\Omega}_k. @@ -359,85 +373,695 @@ where we set the constants $c_1 = c_2 = 1$. This yields the following relation t \begin{displaymath} \mat{\theta}_y = \mat{\theta}(\mat{\eta}_y) = \vech(\diag(\mat{\eta}_{y,1}) + (1_{p\times p} - \mat{I}_p) \odot \reshape{(p, p)}(\mat{\eta}_{y,2})) \end{displaymath} -Note that the diagonal elements of the $\mat{\Omega}_k$ are multiplied by $0$ which means they are ignored. 
This reflects the fact that under the Ising model (in general for the multivariate Bernoulli) holds $\E \ten{X}_{\mat{j}}\mid Y = \E \ten{X}_{\mat{j}}\ten{X}_{\mat{j}} \mid Y$ due to $0$ and $1$ entries only. Therefore our model overparamiterizes as the diagonal elements of $\mat{\Omega}_k$ and $\overline{\ten{\eta}}_1$ serve the same porpose. +Note that the diagonal elements of the $\mat{\Omega}_k$ are multiplied by $0$ which means they are ignored. This reflects the fact that under the Ising model (in general for the multivariate Bernoulli) holds $\E \ten{X}_{\mat{j}}\mid Y = \E \ten{X}_{\mat{j}}\ten{X}_{\mat{j}} \mid Y$ due to $0$ and $1$ entries only. Therefore our model over-parameterize as the diagonal elements of $\mat{\Omega}_k$ and $\overline{\ten{\eta}}_1$ serve the same purpose. -% where $c_1, c_2$ are non-zero known (modeled) constants between $0$ and $1$ such that $c_1 + c_2 = 1$. The ``inverse'' link in then computed via the Ising model as the conditional expectation of all interactions -% \begin{align*} -% \invlink_2(\mat{\eta}_y) \equiv \E_{\mat{\theta}_y}[\vec(\ten{X})\t{\vec(\ten{X})}\mid Y = y] -% \end{align*} -% which incorporates the first moment. In other words $\invlink_1(\mat{\eta}_y) = \diag(\E_{\mat{\theta}_y}[\vec(\ten{X})\t{\vec(\ten{X})}\mid Y = y])$. +\section{Simulations} -% \subsection{Ising Model} -% For the inverse regression $\ten{X}\mid Y = y$ the Ising model probability mass function with $p (p + 1) / 2$ parameters $\mat{\theta}_y$ is given by -% \begin{align*} -% P_{\mat{\theta}_y}(\ten{X}\mid Y = y) -% &= p_0(\mat{\theta}_y)\exp(\t{\vech(\vec(\ten{X})\t{\vec(\ten{X})})}\mat{\theta}_y) \\ -% &= h(\ten{X})\exp(\t{\mat{{\eta}}(\mat{\theta}_y)}\mat{t}(\ten{X}) - b(\mat{\theta}_y)) -% \end{align*} -% where $h(\ten{X}) = 1$ and $b(\mat{\theta}_y) = -\log p_0(\mat{\theta}(\mat{\eta}_y))$. -% According to the GLM model we get -% \begin{align*} -% \mat{\eta}_{y,1} &\equiv c_1 (\overline{\ten{\eta}}_1 + \ten{F}_y\times_{k\in[r]}\mat{\alpha}_k), & -% \mat{\eta}_{y,2} &\equiv c_2 \bigotimes_{k = r}^{1}\mat{\Omega}_k. -% \end{align*} -% which yields the following relation to the conditional Ising model parameters -% \begin{displaymath} -% \mat{\theta}_y = \mat{\theta}(\mat{\eta}_y) = \vech(\diag(\mat{\eta}_{y,1}) + (2_{p\times p} - \mat{I}_p) \odot \reshape{(p, p)}(\mat{\eta}_{y,2})) -% \end{displaymath} -% where $c_1, c_2$ are non-zero known (modeled) constants between $0$ and $1$ such that $c_1 + c_2 = 1$. The ``inverse'' link in then computed via the Ising model as the conditional expectation of all interactions -% \begin{align*} -% \invlink_2(\mat{\eta}_y) \equiv \E_{\mat{\theta}_y}[\vec(\ten{X})\t{\vec(\ten{X})}\mid Y = y] -% \end{align*} -% which incorporates the first moment. In other words $\invlink_1(\mat{\eta}_y) = \diag(\E_{\mat{\theta}_y}[\vec(\ten{X})\t{\vec(\ten{X})}\mid Y = y])$. +\subsection{Simulation 1: Tensor Normal} +$\ten{X}$ is a $2\times 3\times 5$ tensor, $y\in\{1, 2, ..., 6\}$ uniformly distributed and $\ten{F}_y$ is a $1\times 2\times 3$ tensor with $0$ or $1$ entries $(\vec{\ten{F}}_y)_j = \delta_{y,j}$. The GMLM parameters are simply $\overline{\ten{\eta}}_1 = 0$, the matrices $\mat{\alpha}_k$ are filled with independent standard normal entries and the $\Omega_k$ are $AR_{p_k}(0.5)$. The $X \mid Y = y$ was drawn from a Tensor Normal with mean $\mu_y = \ten{F}_y\times_{k\in[3]}\mat{\Omega}_k^{-1}\mat{\alpha}_k$ and co-variances $\mat{\Omega}_k^{-1}$. 
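For reproducibility, the data generating process just described can be sketched in a few lines of R (a hypothetical re-implementation for illustration, reusing the \texttt{ttm} sketch from the notation section; it is not the original simulation script).
\begin{verbatim}
set.seed(1)
n <- 500
p <- c(2, 3, 5); q <- c(1, 2, 3); r <- length(p)
ar1 <- function(d, rho = 0.5) rho^abs(outer(seq_len(d), seq_len(d), `-`))
Omegas <- lapply(p, ar1)                    # Omega_k is AR_{p_k}(0.5)
Deltas <- lapply(Omegas, solve)             # mode covariances Omega_k^{-1}
alphas <- Map(function(pk, qk) matrix(rnorm(pk * qk), pk, qk), p, q)
y <- sample.int(prod(q), n, replace = TRUE) # uniform on {1, ..., 6}
X <- array(0, dim = c(p, n))
for (i in seq_len(n)) {
  F_y <- array(0, dim = q); F_y[y[i]] <- 1  # (vec F_y)_j = delta_{y, j}
  mu_y <- F_y                               # becomes F_y x_k Omega_k^{-1} alpha_k
  Z <- array(rnorm(prod(p)), dim = p)       # standard normal tensor
  for (k in seq_len(r)) {
    mu_y <- ttm(mu_y, Deltas[[k]] %*% alphas[[k]], k)
    Z <- ttm(Z, t(chol(Deltas[[k]])), k)    # gives mode covariances Omega_k^{-1}
  }
  X[, , , i] <- mu_y + Z                    # r = 3 predictor modes, sample mode last
}
\end{verbatim}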
+\begin{figure}[!ht] + \centering + \includegraphics[width = \textwidth]{sim-normal-20221012.png} + \caption{\label{fig:sim-normal}Simulation Normal} +\end{figure} + +\subsection{Simulation 2: Small Ising} + +\begin{figure}[!ht] + \centering + \includegraphics[width = \textwidth]{sim-ising-small-20221012.png} + \caption{\label{fig:sim-ising-small}Simulation Ising Small} +\end{figure} -% The ``inverse'' link is given by -% \begin{align*} -% \invlink_1(\mat{\eta}_y) &\equiv \E_{\mat{\theta}(\mat{\eta}_y)}[\ten{X} | Y = y] \\ -% \invlink_2(\mat{\eta}_y) &\equiv \E_{\mat{\theta}(\mat{\eta}_y)}[\vec(\ten{X})\t{\vec(\ten{X})} | Y = y] -% \end{align*} -% and note that $\diag(\E_{\mat{\theta}(\mat{\eta}_y)}[\vec(\ten{X})\t{\vec(\ten{X})} | Y = y]) \equiv \E_{\mat{\theta}(\mat{\eta}_y)}[\ten{X} | Y = y]$. -% -% The gradients of the log-likelihood are now given by -% \begin{align*} -% \nabla_{\overline{\ten{\eta}}_1} l -% &= \frac{1}{n}\sum_{i = 1}^n \ten{R}_i \\ -% \nabla_{\mat{\alpha}_j} l -% &= \frac{1}{n}\ten{R}_{(j)}\t{(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\alpha}_j)_{(j)}} \\ -% \vec(\nabla_{\mat{\Omega}_j} l) -% &= \t{\vec( (\reshape{(\mat{p}, \mat{p})}(\overline{\mat{t}_2(\ten{X}_i)} - \E[\mat{t}_2(\ten{X})\mid Y = y_i]))_{(j, r + j)} \vec\bigotimes_{\substack{k = r\\k\neq j}}^{1}\mat{\Omega}_j)}\mat{D}_{p_j}\t{\mat{D}_{p_j}} -% \end{align*} -% using the notation $\overline{\mat{t}_2(\ten{X})} = \frac{1}{n}\sum_{i = 1}^n \mat{t}_2(\ten{X}_i) = \frac{1}{n}\sum_{i = 1}^n \vec(\ten{X}_i)\otimes \vec(\ten{X}_i)$. \printbibliography[heading=bibintoc,title={References}] + \appendix +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Fisher Information} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +We start by computing the \emph{observed information matrix} $\mathcal{J}(\mat{\Theta})$ at parameters $\mat{\Theta} = (\overline{\ten{\eta}}_1$, $\mat{\alpha}_1$, $...$, $\mat{\alpha}_r$, $\mat{\Omega}_1$, $...$, $\mat{\Omega}_r)$ given as a block matrix consisting of $(2 r + 1)^2$ blocks (althoug only $(2 r + 1)(r + 1)$ unique blocks by a block symmetry $\mathcal{J}_{i, j} = \t{\mathcal{J}_{j, i}}$) +\begin{displaymath} + \mathcal{J}(\mat{\Theta}) = \begin{pmatrix} + \mathcal{J}_{1,1} &\mathcal{J}_{2,1} & \cdots & \mathcal{J}_{2 r + 1, 1} \\ + \vdots & \vdots & \ddots & \vdots \\ + \mathcal{J}_{2 r + 1,1} & \mathcal{J}_{2 r + 1,1} & \cdots & \mathcal{J}_{2 r + 1, 2 r + 1} + \end{pmatrix} +\end{displaymath} +where each individual block is given by +\begin{displaymath} + \mathcal{J}_{j,l} = -\frac{\partial l(\Theta)}{\partial\t{(\vec{\Theta_j})}\partial(\vec{\Theta_l})}. +\end{displaymath} +For example $\mathcal{J}_{1,2} = -\frac{\partial l(\Theta)}{\partial\t{(\vec{\overline{\ten{\eta}}_1})}\partial(\vec{\mat{\alpha}_1})}$ and $\mathcal{J}_{2r + 1, 2r + 1} = -\H l(\mat{\Omega}_r)$. +We start by restating the log-likelihood for a given single observation $(\ten{X}, \ten{Y})$ where $\ten{F}_y$ given by +\begin{displaymath} + l(\mat{\Theta}) = \log h(\ten{X}) + c_1\langle\overline{\ten{\eta}}_1 + \ten{F}_{y}\times_{k\in[r]}\mat{\alpha}_k, \ten{X} \rangle + c_2\langle\ten{X}\times_{k\in[r]}\mat{\Omega}_k, \ten{X} \rangle - b(\mat{\eta}_{y}) +\end{displaymath} +with +\begin{align*} + \mat{\eta}_1(\mat{\theta}_y) &= \mat{\eta}_{y,1} \equiv c_1 (\overline{\ten{\eta}}_1 + \ten{F}_y\times_{k\in[r]}\mat{\alpha}_k) \\ + \mat{\eta}_2(\mat{\theta}_y) &= \mat{\eta}_{y,2} \equiv c_2 \bigotimes_{k = r}^1 \mat{\Omega}_k. 
+\end{align*} +Now let +\begin{displaymath} + \mat{H} = \H b(\mat{\eta}_y) = \begin{pmatrix} + \H b(\mat{\eta}_{y,1}) & \D(\D b(\mat{\eta}_{y,1}))(\mat{\eta}_{y,2}) \\ \D(\D b(\mat{\eta}_{y,2}))(\mat{\eta}_{y,1}) & \H b(\mat{\eta}_{y,2}) + \end{pmatrix} = \begin{pmatrix} + \mat{H}_{1,1} & \mat{H}_{1,2} \\ \mat{H}_{2,1} & \mat{H}_{2,2} + \end{pmatrix} +\end{displaymath} +as well as tensor valued reshaped versions +\begin{align*} + \ten{D}_{1} &= \reshape{\mat{p}}(\D b(\mat{\eta}_{y,1})) & + \ten{D}_{2} &= \reshape{(\mat{p}, \mat{p})}(\D b(\mat{\eta}_{y,2})) \\ + \ten{H}_{1,1} &= \reshape{(\mat{p}, \mat{p})}(\mat{H}_{1,1}) & + \ten{H}_{1,2} &= \reshape{(\mat{p}, \mat{p}, \mat{p})}(\mat{H}_{1,2}) \\ + \ten{H}_{2,1} &= \reshape{(\mat{p}, \mat{p}, \mat{p})}(\mat{H}_{2,1}) & + \ten{H}_{2,2} &= \reshape{(\mat{p}, \mat{p}, \mat{p}, \mat{p})}(\mat{H}_{2,2}) +\end{align*} +Note that $\ten{D}_{1}$ is of order $r$, $\ten{D}_{2}$ and $\ten{H}_{1,1}$ are tensors of order $2 r$, $\ten{H}_{1,2}$ and $\ten{H}_{2,1}$ have order $3 r$ and $\ten{H}_{2,2}$ is of order $4 r$. +We get the differentials as +\begin{align*} + \d l(\overline{\ten{\eta}}_1) &= c_1(\langle\d\overline{\ten{\eta}}_1, \ten{X}\rangle - \D b(\mat{\eta}_{y,1})\vec{\d\overline{\ten{\eta}}_1}) \\ + \d l(\mat{\alpha}_j) &= c_1(\t{\vec(\ten{X})} - \D b(\mat{\eta}_{y,1}))\vec(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\alpha}_k\times_j\d\mat{\alpha}_j) \\ + \d l(\mat{\Omega}_j) &= c_2\Big(\langle\ten{X}\times_{k\in[r]\backslash j}\mat{\Omega}_k\times_j\d\mat{\Omega}_j, \ten{X}\rangle - \D b(\mat{\eta}_{y,2})\vec\!\Big(\bigotimes_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big)\Big) +\end{align*} +Before rewriting the differentials into a form which reveals the derivatives we compute the second order differentials. 
+{\allowdisplaybreaks\begin{align*} + \d^2 l(\overline{\ten{\eta}}_1) + &= -c_1^2 \t{\d\vec(\overline{\ten{\eta}}_1)}\mat{H}_{1,1}\d\vec(\overline{\ten{\eta}}_1) \\ + \d^2 l(\overline{\ten{\eta}}_1, \mat{\alpha}_j) + &= -c_1^2 \t{\vec(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\alpha}_k\times_j\d\mat{\alpha}_j)}\mat{H}_{1,1}\d\vec(\overline{\ten{\eta}}_1) \\ + \d^2 l(\overline{\ten{\eta}}_1, \mat{\Omega}_j) + &= -c_1 c_2 \t{\vec\!\Big(\bigotimes_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big)}\mat{H}_{2,1}\d\vec(\overline{\ten{\eta}}_1) \\ + \d^2 l(\mat{\alpha}_j) + &= -c_1^2 \t{\vec(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\alpha}_k\times_j\d\mat{\alpha}_j)}\mat{H}_{1,1}\vec(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\alpha}_k\times_j\d\mat{\alpha}_j) \\ + \d^2 l(\mat{\alpha}_j, \mat{\alpha}_l) + &\overset{\makebox[0pt]{\scriptsize $j\neq l$}}{=} -c_1^2 \t{\vec(\ten{F}_y\times_{k\in[r]\backslash l}\mat{\alpha}_k\times_l\d\mat{\alpha}_l)}\mat{H}_{1,1}\vec(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\alpha}_k\times_j\d\mat{\alpha}_j) \\ + &\qquad + c_1(\t{(\vec{\ten{X}})} - \D b(\mat{\eta}_{y,1}))\vec(\ten{F}_y\times_{k\in[r]\backslash\{j, l\}}\mat{\alpha}_k\times_j\d\mat{\alpha}_j\times_l\d\mat{\alpha}_l) \\ + \d^2 l(\mat{\alpha}_j, \mat{\Omega}_l) + &= -c_1 c_2 \t{\vec\!\Big(\bigotimes_{k = r}^{l + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_l\otimes\bigotimes_{k=l-1}^{1}\mat{\Omega}_k\Big)}\mat{H}_{2,1}\vec(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\alpha}_k\times_j\d\mat{\alpha}_j) \\ + \d^2 l(\mat{\Omega}_j) + &= -c_2^2 \t{\vec\!\Big(\bigotimes_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big)}\mat{H}_{2,2}\vec\!\Big(\bigotimes_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big) \\ + \d^2 l(\mat{\Omega}_j, \mat{\Omega}_l) + &\overset{\makebox[0pt]{\scriptsize $j < l$}}{=} c_2 \langle\ten{X}\times_{k\in[r]\backslash \{j, l\}}\mat{\Omega}_k\times_j\d\mat{\Omega}_j\times_l\d\mat{\Omega}_l, \ten{X}\rangle \\ + &\qquad - c_2^2 \t{\vec\!\Big(\bigotimes_{k = r}^{l + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_l\otimes\bigotimes_{k=l-1}^{1}\mat{\Omega}_k\Big)}\mat{H}_{2,2}\vec\!\Big(\bigotimes_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big) \\ + &\qquad\qquad - c_2 \D b(\mat{\eta}_{y,2})\vec\!\Big(\bigotimes_{k = r}^{l + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_{l}\otimes\bigotimes_{k = l - 1}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_{j}\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big) +\end{align*}}% +Now we rewrite all the above differentials to extract the derivatives one at a time with extensive usage of the identities from Theorem~\ref{thm:mlm_mat} and Theorem~\ref{thm:mtvk_rearrange}. 
We start with the first order differentials +{\allowdisplaybreaks\begin{align*} + \d l(\overline{\ten{\eta}}_1) &= c_1\t{\vec(\ten{X} - \ten{D}_1)}\vec{\d\overline{\ten{\eta}}_1} \\ + &\qquad\Rightarrow \D l(\overline{\ten{\eta}}_1) \equiv c_1 (\ten{X} - \ten{D}_1) \\ +% + \d l(\mat{\alpha}_j) &= c_1 \tr((\ten{X} - \ten{D}_1)_{(j)}\t{(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}\t{\d\mat{\alpha}_j}) \\ + &\qquad\Rightarrow \D l(\mat{\alpha}_j) \equiv c_1 (\ten{X} - \ten{D}_1)_{(j)}\t{(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}} \\ +% + \d l(\mat{\Omega}_j) &= c_2\Big(\langle\ten{X}\times_{k\in[r]\backslash j}\mat{\Omega}_k\times_j\d\mat{\Omega}_j, \ten{X}\rangle - \D b(\mat{\eta}_{y,2})\vec\!\Big(\bigotimes_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big)\Big) \\ + &= c_2 \t{(\vec{\ten{X}}\otimes\vec{\ten{X}} - (\ten{D}_2)_{([2r])})}\vec\!\Big(\bigotimes_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big) \\ + &= c_2 (\ten{X}\otimes\ten{X} - \ten{R}_{[2r]}(\ten{D}_2))\mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}\times_j\t{(\vec{\d\mat{\Omega}_j})} \\ + &= c_2 \t{\vec\Bigl((\ten{X}\otimes\ten{X} - \ten{R}_{[2r]}(\ten{D}_2))\mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}\Bigr)}\vec{\d\mat{\Omega}_j} \\ + &= c_2 \t{\vec\Bigl((\ten{X}\otimes\ten{X} - \ten{R}_{[2r]}(\ten{D}_2))\mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}\Bigr)}\mat{D}_{p_j}\t{\mat{D}_{p_j}}\vec{\d\mat{\Omega}_j} \\ + &\qquad\Rightarrow \D l(\mat{\Omega}_j) = c_2 \t{\vec\Bigl((\ten{X}\otimes\ten{X} - \ten{R}_{[2r]}(\ten{D}_2))\mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}\Bigr)}\mat{D}_{p_j}\t{\mat{D}_{p_j}} +\end{align*}}% +The next step is to identify the Hessians from the second differentials in a similar manner as befor. 
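The identities below also make repeated use of a tensor-tensor contraction $\ttt$ (the operation added as \texttt{ttt} to the R package in this patch). To fix the index bookkeeping, a generic base-R sketch of such a contraction over given modes is shown here; the argument convention of the package function may differ.
\begin{verbatim}
ttt <- function(A, B, ma, mb = ma) {
  # contract tensor A with tensor B over modes ma (of A) and mb (of B);
  # the result carries the remaining modes of A, then those of B
  da <- dim(A); db <- dim(B)
  stopifnot(all(da[ma] == db[mb]))
  ra <- seq_along(da)[-ma]; rb <- seq_along(db)[-mb]
  Am <- matrix(aperm(A, c(ra, ma)), prod(da[ra]))  # rest(A) x contracted
  Bm <- matrix(aperm(B, c(mb, rb)), prod(db[mb]))  # contracted x rest(B)
  array(Am %*% Bm, dim = c(da[ra], db[rb]))
}
\end{verbatim}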
+{\allowdisplaybreaks\begin{align*} + &\d^2 l(\overline{\ten{\eta}}_1) \\ + &= -c_1^2 \t{(\vec{\d\overline{\ten{\eta}}_1})}\mat{H}_{1,1}\vec{\d\overline{\ten{\eta}}_1} \\ + &\qquad\Rightarrow \H l(\overline{\ten{\eta}}_1) = -c_1^2 \mat{H}_{1,1} + \qquad{\color{gray} (p \times p)} + \\ + &\d^2 l(\overline{\ten{\eta}}_1, \mat{\alpha}_j) \\ + &= -c_1^2 \t{\vec(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\times_j\d\mat{\alpha}_j)}\mat{H}_{1,1}\vec{\d\overline{\ten{\eta}}_1} \\ + &= -c_1^2 \t{\vec(\d\mat{\alpha}_j(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)})}\mat{K}_{p,(j)}\mat{H}_{1,1}\vec{\d\overline{\ten{\eta}}_1} \\ + &= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})}((\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}\otimes\mat{I}_{p_j})(\ten{H}_{1,1})_{((j, [r]\backslash j))}\vec{\d\overline{\ten{\eta}}_1} \\ + &= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} ( (\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k) \ttt_{[r]\backslash j} \ten{H}_{1,1})_{((2, 1))} \vec{\d\overline{\ten{\eta}}_1} \\ + &\qquad\Rightarrow \frac{\partial l}{\partial(\vec{\mat{\alpha}_j})\t{\partial(\vec{\overline{\ten{\eta}}_1)}}} = -c_1^2 ( (\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k) \ttt_{[r]\backslash j} \ten{H}_{1,1})_{((2, 1))} + \qquad{\color{gray} (p_j q_j \times p)} + \\ + &\d^2 l(\overline{\ten{\eta}}_1, \mat{\Omega}_j) \\ + &= -c_1 c_2 \t{\vec\!\Big(\bigotimes_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big)}\mat{H}_{2,1}\vec{\d\overline{\ten{\eta}}_1} \\ + &= -c_1 c_2 \t{\Big[ \t{(\ten{H}_{2,1})_{([2r])}} \vec\!\Big(\bigotimes_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big) \Big]} \vec{\d\overline{\ten{\eta}}_1} \\ + &= -c_1 c_2 \t{\vec( \ten{R}_{[2r]}(\ten{H}_{2,1}) \mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}\times_j\t{(\vec{\d\mat{\Omega}_j})} )} \vec{\d\overline{\ten{\eta}}_1} \\ + &= -c_1 c_2 \t{(\vec{\d\mat{\Omega}_j})} ( \ten{R}_{[2r]}(\ten{H}_{2,1}) \mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})} )_{(j)} \vec{\d\overline{\ten{\eta}}_1} \\ + &\qquad\Rightarrow \frac{\partial l}{\partial(\vec{\mat{\Omega}_j})\t{\partial(\vec{\overline{\ten{\eta}}_1)}}} = -c_1 c_2 ( \ten{R}_{[2r]}(\ten{H}_{2,1}) \mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})} )_{(j)} + \qquad{\color{gray} (p_j^2 \times p)} + \\ + &\d^2 l(\mat{\alpha}_j) \\ + &= -c_1^2 \t{\vec(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\times_j\d\mat{\alpha}_j)}\mat{H}_{1,1}\vec(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\times_j\d\mat{\alpha}_j) \\ + &= -c_1^2 \t{\vec(\d\mat{\alpha}_j(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)})}\mat{K}_{\mat{p},(j)}\mat{H}_{1,1}\t{\mat{K}_{\mat{p},(j)}}\vec(\d\mat{\alpha}_j(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}) \\ + &= -c_1^2 \t{[((\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}\otimes\mat{I}_{p_j})\vec{\d\mat{\alpha}_j}]}\mat{K}_{\mat{p},(j)}\mat{H}_{1,1}\t{\mat{K}_{\mat{p},(j)}}((\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}\otimes\mat{I}_{p_j})\vec{\d\mat{\alpha}_j} \\ + &= -c_1^2 \t{[((\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}\otimes\mat{I}_{p_j})\vec{\d\mat{\alpha}_j}]}(\ten{H}_{1,1})_{((j,[r]\backslash j),(j,[r]\backslash j))}((\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}\otimes\mat{I}_{p_j})\vec{\d\mat{\alpha}_j} \\ + &= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})}[ ((\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k)\ttt_{[r]\backslash j}\ten{H}_{1,1})\ttt_{[r]\backslash j + 2,[r]\backslash 
j}(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k) ]_{((2,1))}\vec{\d\mat{\alpha}_j} \\ + &\qquad\Rightarrow \H l(\mat{\alpha}_j) = -c_1^2 \Big[ \left(\Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big)\ttt_{[r]\backslash j}\ten{H}_{1,1}\right)\ttt_{[r]\backslash j + 2}^{[r]\backslash j}\Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \Big]_{((2,1))} + \qquad{\color{gray} (p_j q_j \times p_j q_j)} + \\ + &\d^2 l(\mat{\alpha}_j, \mat{\alpha}_l) \\ + &\overset{\makebox[0pt]{\scriptsize $j < l$}}{=} -c_1^2 \t{\vec\Bigl(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\times_j\d\mat{\alpha}_j\Bigr)}\mat{H}_{1,1}\vec\Bigl(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\times_l\d\mat{\alpha}_l\Bigr) \\ + &\qquad + c_1 (\t{(\vec{\ten{X}})} - \D b(\mat{\eta}_{y,1})) \vec\Bigl(\ten{F}_y\mlm{k\in[r]\backslash\{j,l\}}\mat{\alpha}_k\times_j\d\mat{\alpha}_j\times_l\d\mat{\alpha}_l\Bigr) \\ + &= -c_1^2 \t{\vec\biggl( \d\mat{\alpha}_j \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big)_{(j)} \biggr)} \mat{K}_{\mat{p},(j)}\mat{H}_{1,1}\t{\mat{K}_{\mat{p},(l)}} \vec\biggl( \d\mat{\alpha}_l \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big)_{(l)} \biggr) \\ + &\qquad + c_1 (\t{(\vec{\ten{X}})} - \D b(\mat{\eta}_{y,1})) \t{\mat{K}_{\mat{p},((j,l))}} \vec\biggl( (\d\mat{\alpha}_l\otimes\d\mat{\alpha}_j) \Big( \ten{F}_y\mlm{k\in[r]\backslash\{j,l\}}\mat{\alpha}_k \Big)_{((j,l))} \biggr) \\ + &= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big)_{(j)}\otimes\mat{I}_{p_j} \biggr) \mat{K}_{\mat{p},(j)}\mat{H}_{1,1}\t{\mat{K}_{\mat{p},(l)}} \biggl( \t{\Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big)_{(l)}}\otimes\mat{I}_{p_l} \biggr)\vec{\d\mat{\alpha}_l} \\ + &\qquad + c_1 (\t{(\vec{\ten{X}})} - \D b(\mat{\eta}_{y,1})) \t{\mat{K}_{\mat{p},((j,l))}} \biggl( \t{\Big( \ten{F}_y\mlm{k\in[r]\backslash\{j,l\}}\mat{\alpha}_k \Big)_{((j,l))}}\otimes\mat{I}_{p_j p_l} \biggr) \vec{(\d\mat{\alpha}_l\otimes\d\mat{\alpha}_j)} \\ + &= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{(2,1)} \vec{\d\mat{\alpha}_l} \\ + &\qquad + c_1 \vec\biggl( (\ten{X} - \ten{D}_1) \ttt_{[r]\backslash\{j,l\}} \Big( \ten{F}_y\mlm{k\neq j,l}\mat{\alpha}_k \Big) \biggr) \vec{(\d\mat{\alpha}_l\otimes\d\mat{\alpha}_j)} \\ + &= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{(2,1)} \vec{\d\mat{\alpha}_l} \\ + &\qquad + c_1 \t{(\vec{\d\mat{\alpha}_j})} \biggl( (\ten{X} - \ten{D}_1) \ttt_{[r]\backslash\{j,l\}} \Big( \ten{F}_y\mlm{k\neq j,l}\mat{\alpha}_k \Big) \biggr)_{((1,3))} \vec{\d\mat{\alpha}_l} \\ + &\qquad \begin{aligned} + \Rightarrow \frac{\partial l}{\partial(\vec{\mat{\alpha}_j})\t{\partial(\vec{\mat{\alpha}_l})}} &= + -c_1^2 \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{(2,1)} \\ + &\qquad + c_1 \biggl( (\ten{X} - \ten{D}_1) \ttt_{[r]\backslash\{j,l\}} \Big( \ten{F}_y\mlm{k\neq j,l}\mat{\alpha}_k \Big) \biggr)_{((1,3) + [[j > l]])} + 
\qquad{\color{gray} (p_j q_j \times p_l q_l)} + \end{aligned} + \\ + &\d^2 l(\mat{\alpha}_j, \mat{\Omega}_l) \\ + &= -c_1 c_2 \t{\vec\Bigl(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\times_j\d\mat{\alpha}_j\Bigr)} \mat{H}_{1,2} \vec\Bigl(\bigkron{k = r}{l + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_l\otimes\bigkron{k=l-1}{1}\mat{\Omega}_k\Bigr) \\ + &= -c_1 c_2 \t{\vec\biggl(\d\mat{\alpha}_j\Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big)_{(j)}\biggr)}\mat{K}_{\mat{p},(j)} \t{(\ten{H}_{2,1})_{([2r])}} \vec\Bigl(\bigkron{k = r}{l + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_l\otimes\bigkron{k=l-1}{1}\mat{\Omega}_k\Bigr) \\ + &= -c_1 c_2 \t{(\vec{\d\mat{\alpha}_j})}\biggl(\t{\Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big)_{(j)}}\otimes\mat{I}_{p_j}\biggr) \mat{K}_{\mat{p},(j)} \vec\Bigl(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\neq l}\t{(\vec{\mat{\Omega}_k})}\times_l\t{(\vec{\d\mat{\Omega}_l})}\Bigr) \\ + &= -c_1 c_2 \t{(\vec{\d\mat{\alpha}_j})}\biggl(\t{\Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big)_{(j)}}\otimes\mat{I}_{p_j}\biggr) \mat{K}_{\mat{p},(j)} \t{\Bigl(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\neq l}\t{(\vec{\mat{\Omega}_k})}\Bigr)_{([r])}}\vec{\d\mat{\Omega}_l} \\ + &= -c_1 c_2 \t{(\vec{\d\mat{\alpha}_j})}\biggl( \Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j}^{[r]\backslash j + r} \Bigl(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\neq l}\t{(\vec{\mat{\Omega}_k})}\Bigr) \biggr)_{(r + 2, 1)} \vec{\d\mat{\Omega}_l} \\ + &\qquad\Rightarrow \frac{\partial l}{\partial(\vec{\mat{\alpha}_j})\t{\partial(\vec{\mat{\Omega}_l})}} = -c_1 c_2 \biggl( \Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j}^{[r]\backslash j + r} \Bigl(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\neq l}\t{(\vec{\mat{\Omega}_k})}\Bigr) \biggr)_{(r + 2, 1)} + % \qquad {\color{gray} (p_j q_j \times p_l^2)} + \\ + &\d^2 l(\mat{\Omega}_j) \\ + &= -c_2^2 \t{\vec\Bigl(\bigkron{k = r}{l + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_l\otimes\bigkron{k=l-1}{1}\mat{\Omega}_k\Bigr)} \t{(\ten{H}_{2,2})_{([2r],[2r]+2r)}} \vec\Bigl(\bigkron{k = r}{l + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_l\otimes\bigkron{k=l-1}{1}\mat{\Omega}_k\Bigr) \\ + &= -c_2^2 \ten{R}_{[2r],[2r]+2r}(\ten{H}_{2,2})\mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}\mlm{\substack{k + r\\k\in[r]\backslash j}}\t{(\vec{\mat{\Omega}_k})}\times_j\t{(\vec{\d\mat{\Omega}_j})}\times_{j + r}\t{(\vec{\d\mat{\Omega}_j})} \\ + &= -c_2^2 \t{(\vec{\d\mat{\Omega}_j})} \biggl( \ten{R}_{[2r],[2r]+2r}(\ten{H}_{2,2})\mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}\mlm{\substack{k + r\\k\in[r]\backslash j}}\t{(\vec{\mat{\Omega}_k})} \biggr)_{([r])} \vec{\d\mat{\Omega}_j} \\ + &\qquad\Rightarrow \H l(\mat{\Omega}_j) = -c_2^2 \biggl( \ten{R}_{[2r],[2r]+2r}(\ten{H}_{2,2})\mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}\mlm{\substack{k + r\\k\in[r]\backslash j}}\t{(\vec{\mat{\Omega}_k})} \biggr)_{([r])} + \qquad {\color{gray} (p_j^2 \times p_j^2)} + \\ + &\d^2 l(\mat{\Omega}_j, \mat{\Omega}_l) \\ + &\overset{\makebox[0pt]{\scriptsize $j < l$}}{=} c_2 \langle\ten{X}\times_{k\in[r]\backslash \{j, l\}}\mat{\Omega}_k\times_j\d\mat{\Omega}_j\times_l\d\mat{\Omega}_l, \ten{X}\rangle \\ + &\qquad - c_2^2 \t{\vec\!\Big(\bigotimes_{k = r}^{l + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_l\otimes\bigotimes_{k=l-1}^{1}\mat{\Omega}_k\Big)}\mat{H}_{2,2}\vec\!\Big(\bigotimes_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big) \\ + &\qquad\qquad - c_2 \D b(\mat{\eta}_{y,2})\vec\!\Big(\bigotimes_{k = r}^{l + 
1}\mat{\Omega}_k\otimes\d\mat{\Omega}_{l}\otimes\bigotimes_{k = l - 1}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_{j}\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big) \\
+    &= c_2 \t{(\vec{\ten{X}}\otimes\vec{\ten{X}} - (\ten{D}_2)_{([2r])})} \vec\Bigl(\bigotimes_{k = r}^{l + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_{l}\otimes\bigotimes_{k = l - 1}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_{j}\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Bigr) \\
+    &\qquad - c_2^2 \t{\vec\!\Big(\bigotimes_{k = r}^{l + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_l\otimes\bigotimes_{k=l-1}^{1}\mat{\Omega}_k\Big)}\t{(\ten{H}_{2,2})_{([2r],[2r]+2r)}}\vec\!\Big(\bigotimes_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k=j-1}^{1}\mat{\Omega}_k\Big) \\
+    &= c_2 (\ten{X}\otimes\ten{X} - \ten{R}_{[2r]}(\ten{D}_2)) \mlm{k\neq j,l}\t{(\vec{\mat{\Omega}_k})} \times_j \t{(\vec{\d\mat{\Omega}_j})} \times_l \t{(\vec{\d\mat{\Omega}_l})} \\
+    &\qquad - c_2^2 \ten{R}_{([2r],[2r]+2r)}(\ten{H}_{2,2}) \mlm{k\in [r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \mlm{\substack{k + r \\ k\in [r]\backslash l}}\t{(\vec{\mat{\Omega}_k})} \times_j \t{(\vec{\d\mat{\Omega}_j})} \times_{l + r} \t{(\vec{\d\mat{\Omega}_l})} \\
+    &= c_2 \t{(\vec{\d\mat{\Omega}_j})}\Big((\ten{X}\otimes\ten{X} - \ten{R}_{[2r]}(\ten{D}_2)) \mlm{k\neq j,l}\t{(\vec{\mat{\Omega}_k})} \Big)_{(j)}\vec{\d\mat{\Omega}_l} \\
+    &\qquad - c_2^2 \t{(\vec{\d\mat{\Omega}_j})}\Big(\ten{R}_{([2r],[2r]+2r)}(\ten{H}_{2,2}) \mlm{k\in [r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \mlm{\substack{k + r \\ k\in [r]\backslash l}}\t{(\vec{\mat{\Omega}_k})}\Big)_{(j)}\vec{\d\mat{\Omega}_l} \\
+    &\qquad \begin{aligned}\Rightarrow \frac{\partial l}{\partial(\vec{\mat{\Omega}_j})\t{\partial(\vec{\mat{\Omega}_l})}} &=
+        c_2 \Big((\ten{X}\otimes\ten{X} - \ten{R}_{[2r]}(\ten{D}_2)) \mlm{k\neq j,l}\t{(\vec{\mat{\Omega}_k})} \Big)_{(j)} \\
+        &\qquad -c_2^2 \Big(\ten{R}_{([2r],[2r]+2r)}(\ten{H}_{2,2}) \mlm{k\in [r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \mlm{\substack{k + r \\ k\in [r]\backslash l}}\t{(\vec{\mat{\Omega}_k})}\Big)_{(j)}
+        % \qquad {\color{gray} (p_j^2 \times p_l^2)}
+    \end{aligned}
+\end{align*}}%
+This concludes the computation of the observed information matrix $\mathcal{J}(\mat{\Theta})$ as all blocks (or their transposed counterparts) are computed. To get to the Fisher information we continue by computing the expectation of the individual blocks $\mathcal{I}_{i,j} = \E[\mathcal{J}_{i,j} \mid \ten{Y} = y]$. A closer look at the individual blocks shows that only a few of them are random, as only $\frac{\partial l}{\partial(\vec{\mat{\alpha}_j})\t{\partial(\vec{\mat{\alpha}_l})}}$ and $\frac{\partial l}{\partial(\vec{\mat{\Omega}_j})\t{\partial(\vec{\mat{\Omega}_l})}}$ for $j \neq l$ depend on $\ten{X}$. Only for those does the expectation need to be calculated, because all other blocks are constant with respect to the conditional expectation. Therefore, we start by computing the expectation of some subexpressions encountered in these blocks.
+\begin{displaymath}
+    \E[\ten{X}\mid\ten{Y} = y] \equiv \vec{\E[\ten{X}\mid\ten{Y} = y]} = \E[\mat{t}_1(\ten{X})\mid\ten{Y} = y] = \t{\D b(\mat{\eta}_{y,1})} \equiv \ten{D}_1.
+\end{displaymath}
+From this we get $\E[\ten{X}\mid\ten{Y} = y] = \ten{D}_1$ because their shapes are identical.
+Next consider
+\begin{displaymath}
+    \ten{X}\otimes\ten{X}
+    = \ten{R}_{[2r]}(\reshape{(\mat{p}, \mat{p})}(\vec{\ten{X}}\otimes\vec{\ten{X}}))
+    = \ten{R}_{[2r]}(\reshape{(\mat{p}, \mat{p})}(\mat{t}_2(\ten{X}))).
+\end{displaymath}
+By the linearity of the expectation, rearranging and reshaping we get
+\begin{displaymath}
+    \E[\ten{X}\otimes\ten{X} \mid \ten{Y} = y]
+    = \ten{R}_{[2r]}(\reshape{(\mat{p}, \mat{p})}(\E[\mat{t}_2(\ten{X})\mid\ten{Y} = y]))
+    = \ten{R}_{[2r]}(\reshape{(\mat{p}, \mat{p})}(\D b(\mat{\eta}_{y,2})))
+    = \ten{R}_{[2r]}(\ten{D}_2)
+\end{displaymath}
+and with that all auxiliary calculations are done. In particular, $\E[\ten{X} - \ten{D}_1\mid\ten{Y} = y] = 0$ and $\E[\ten{X}\otimes\ten{X} - \ten{R}_{[2r]}(\ten{D}_2)\mid\ten{Y} = y] = 0$, so the $\ten{X}$ dependent terms in the mixed $\mat{\alpha}_j, \mat{\alpha}_l$ and $\mat{\Omega}_j, \mat{\Omega}_l$ blocks vanish under the conditional expectation. Using the two expectations yields the blocks of the Fisher information
+\begin{displaymath}
+    \mathcal{I}(\mat{\Theta}) = \E[\mathcal{J}(\mat{\Theta})\mid\ten{Y} = y] = \begin{pmatrix}
+        \mathcal{I}_{1,1} & \mathcal{I}_{1,2} & \cdots & \mathcal{I}_{1, 2 r + 1} \\
+        \vdots & \vdots & \ddots & \vdots \\
+        \mathcal{I}_{2 r + 1, 1} & \mathcal{I}_{2 r + 1, 2} & \cdots & \mathcal{I}_{2 r + 1, 2 r + 1}
+    \end{pmatrix}
+\end{displaymath}
+where, for $j, l = 1, ..., r$, the blocks are given as follows
+\begin{align*}
+    \mathcal{I}_{1,1} &= c_1^2 (\ten{H}_{1,1})_{([r])} \\
+    \mathcal{I}_{j+1,1} = \t{\mathcal{I}_{1,j+1}}
+    &= c_1^2 \Big[\Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1}\Big]_{((2, 1))} \\
+    \mathcal{I}_{j+r+1,1} = \t{\mathcal{I}_{1,j+r+1}}
+    &= c_1 c_2 \Big( \ten{R}_{[2r]}(\ten{H}_{2,1}) \mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \Big)_{(j)} \\
+    \mathcal{I}_{j+1,l+1} = \t{\mathcal{I}_{l+1,j+1}}
+    &= c_1^2 \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \\
+    \mathcal{I}_{j+1,l+r+1} = \t{\mathcal{I}_{l+r+1,j+1}}
+    &= c_1 c_2 \biggl( \Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j}^{[r]\backslash j + r} \Bigl(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\neq l}\t{(\vec{\mat{\Omega}_k})}\Bigr) \biggr)_{((r + 2, 1))} \\
+    \mathcal{I}_{j+r+1,l+r+1} = \t{\mathcal{I}_{l+r+1,j+r+1}}
+    &= c_2^2 \Big(\ten{R}_{([2r],[2r]+2r)}(\ten{H}_{2,2}) \mlm{k\in [r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \mlm{\substack{k + r \\ k\in [r]\backslash l}}\t{(\vec{\mat{\Omega}_k})}\Big)_{(j)}
+\end{align*}
+
+
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \section{Vectorization and Matricization}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+The \emph{matricization} is a generalization of the \emph{vectorization} operation, and the \emph{mode product} generalizes the matrix-matrix product, which in turn is used in the \emph{multi-linear multiplication}. In this section we provide some relations between these operations in conjunction with the Kronecker product.
+
+\begin{theorem}\label{thm:mlm_mat}
+    Let $\ten{A}$ be a tensor of order $r$ with dimensions $q_1\times ... \times q_r$ and, for $k = 1, ..., r$, let $\mat{B}_k$ be matrices of dimensions $p_k\times q_k$. Then, for any $(\mat{i}, \mat{j})\in\perm{r}$, it holds that
+    \begin{displaymath}
+        (\ten{A}\times_{k\in[r]}\mat{B}_k)_{(\mat{i}, \mat{j})}
+        = \Big(\bigotimes_{k = \len{\mat{i}}}^{1}\mat{B}_{\mat{i}_k}\Big) \ten{A}_{(\mat{i}, \mat{j})} \Big(\bigotimes_{k = \len{\mat{j}}}^{1}\t{\mat{B}_{\mat{j}_k}}\Big).
+    \end{displaymath}
+\end{theorem}
+
+A well-known special case of Theorem~\ref{thm:mlm_mat} is the relation between the vectorization and the Kronecker product
 \begin{displaymath}
-    \vec(\ten{A}\times_{k\in[r]}\mat{B}_k) = \Big(\bigotimes_{k = r}^1 \mat{B}_k\Big)\vec\ten{A}
+    \vec(\mat{B}_1\mat{A}\t{\mat{B}_2}) = (\mat{B}_2\otimes\mat{B}_1)\vec{\mat{A}}.
 \end{displaymath}
-\begin{displaymath}
-    (\ten{A}\times_{k\in[r]}\mat{B}_k)_{(j)} = \mat{B}_j\ten{A}_{(j)}\bigotimes_{\substack{k = r\\k\neq j}}^1\t{\mat{B}_k}
-\end{displaymath}
-of which a special case is $(\ten{A}\times_{j}\mat{B}_j)_{(j)} = \mat{B}_j\ten{A}_{(j)}$.
+Here $\mat{A}$ is a matrix, that is an order $2$ tensor, and the vectorization is the special case $\vec{\mat{A}} = \mat{A}_{((1, 2))}$ of the matricization with $(\mat{i}, \mat{j}) = ((1, 2), ())\in\perm{2}$. Note that the empty Kronecker product is $1$ by convention.
+
+Before we state another identity of interest, which arises in matrix calculus, we define a new operation similar in nature to the rearrangement operation in \cite{ApproxKron-VanLoanPitsianis1993}.
+\begin{defn}
+    Let $\ten{A}$ be an order $2 r + 2 s + t$ tensor, where $r > 0$ and $s, t \geq 0$, of dimensions $q_1\times ... \times q_{2 r + 2 s + t}$. Furthermore, let $(\mat{i}, \mat{j}, \mat{k})\in\perm{2 r + 2 s + t}$ such that $\len{\mat{i}} = 2 r$ and $\len{\mat{j}} = 2 s$. The operation $\ten{R}_{\mat{i}, \mat{j}}$ is then defined as
+    \begin{displaymath}
+        \ten{R}_{\mat{i}, \mat{j}}(\ten{A}) = \reshape{\mat{d}}(\ten{A}_{((\pi(\mat{i}), \rho(\mat{j})))})
+    \end{displaymath}
+    where both $\pi, \rho$ are \emph{inverse perfect outer shuffles} and the new dimension $\mat{d}$ is given by
+    \begin{displaymath}
+        \mat{d} = (q_{\mat{i}_1} q_{\mat{i}_{r + 1}}, ..., q_{\mat{i}_{r}} q_{\mat{i}_{2 r}}, q_{\mat{j}_1} q_{\mat{j}_{s + 1}}, ..., q_{\mat{j}_s} q_{\mat{j}_{2 s}}, q_{2 r + 2 s + 1}, ..., q_{2 r + 2 s + t}).
+    \end{displaymath}
+    In the case of $s = 0$ we drop the second index and write $\ten{R}_{\mat{i}}$ because $\mat{j}$ is empty.
+\end{defn}
+The operation $\ten{R}_{\mat{i}, \mat{j}}(\ten{A})$ results in a tensor of order $r + s + t$ as the reshape operation collapses pairs of consecutive axes after shuffling; accordingly, the one index version $\ten{R}_{\mat{i}}(\ten{A})$ results in a tensor of order $r + t$.
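+For instance, let $\ten{A}$ be an order $6$ tensor of dimensions $q_1\times ... \times q_6$ and take $\mat{i} = (1, 2, 3, 4)$ and $\mat{j} = ()$, that is $r = 2$, $s = 0$ and $t = 2$. Then the inverse perfect outer shuffle is $\pi(\mat{i}) = (1, 3, 2, 4)$ and
+\begin{displaymath}
+    \ten{R}_{([4])}(\ten{A}) = \reshape{(q_1 q_3, q_2 q_4, q_5, q_6)}(\ten{A}_{((1, 3, 2, 4))})
+\end{displaymath}
+is an order $4$ tensor of dimensions $q_1 q_3\times q_2 q_4\times q_5\times q_6$; compare the illustrations below.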
+ +\begin{center} + \begin{tikzpicture}[>=latex] + \foreach \t in {1,...,13} { + \node at (\t, 2) {$q_{\t}$}; + } + % \foreach \card [count=\y, evaluate=\y as \h using {\y / 3.5}] in {4,...,2,A} { + \foreach \t [count=\f] in {1, 3, 5, 2, 4, 6} { + \node[label=above:$\mat{i}_\f$] (f) at (\f, 1) {$\f$}; + \node[label=below:$\pi(\mat{i})_\t$] (t) at (\t, -1) {$\f$}; + \draw[->] (f) to[out=270, in=90] (t); + } + \begin{scope}[xshift=6cm] + \foreach \t [count=\f, evaluate=\f as \ff using {int(\f + 6)}] in {1, 3, 2, 4} { + \node[label=above:$\mat{j}_\f$] (f) at (\f, 1) {$\ff$}; + \node[label=below:$\sigma(\mat{j})_\t$] (t) at (\t, -1) {$\ff$}; + \draw[->] (f) to[out=270, in=90] (t); + } + \end{scope} + \begin{scope}[xshift=10cm] + \foreach \t [count=\f, evaluate=\f as \ff using {int(\f + 10)}] in {1, 2, 3} { + \node[label=above:$\mat{k}_\f$] (f) at (\f, 1) {$\ff$}; + \node[label=below:$\mat{k}_\t$] (t) at (\t, -1) {$\ff$}; + \draw[->] (f) to[out=270, in=90] (t); + } + \end{scope} + \foreach \t [count=\f] in {1,4,2,5,3,6,7,9,8,10,11,12,13} { + \node (d-\f) at (\f, -2) {$q_{\t}$}; + } + \foreach \l\r [count=\f] in {1/4,2/5,3/6,7/9,8/10} { + \node[label=below:$d_{\f}$] (e-\f) at ({2 * \f - 0.5}, -4) {$q_{\l} q_{\r}$}; + } + \foreach \t [count=\f from 6] in {11,12,13} { + \node[label=below:$d_{\f}$] (e-\t) at (\t, -4) {$q_{\t}$}; + } + \draw[->] (d-1) to[out=270, in=90] (e-1); \draw (d-2) to[out=270, in=90] (e-1); + \draw[->] (d-3) to[out=270, in=90] (e-2); \draw (d-4) to[out=270, in=90] (e-2); + \draw[->] (d-5) to[out=270, in=90] (e-3); \draw (d-6) to[out=270, in=90] (e-3); + \draw[->] (d-7) to[out=270, in=90] (e-4); \draw (d-8) to[out=270, in=90] (e-4); + \draw[->] (d-9) to[out=270, in=90] (e-5); \draw (d-10) to[out=270, in=90] (e-5); + \draw[->] (d-11) -- (e-11); + \draw[->] (d-12) -- (e-12); + \draw[->] (d-13) -- (e-13); + + \node[anchor=east] at (0.5, 2) {Dims. $\ten{A}$}; + \node[anchor=east] at (0.5, 1) {Modes}; + \node[anchor=east] at (0.5, -1) {Perm. Modes}; + \node[anchor=east] at (0.5, -2) {Perm. Dims.}; + \node[anchor=east] at (0.5, -4) {Dims. $\ten{R}_{\mat{i}, \mat{j}}(\ten{A})$}; + \end{tikzpicture} +\end{center} + +\begin{theorem}\label{thm:mtvk_rearrange} + Let $\ten{A}$ be a $2 r + s$ tensor where $r > 0$ and $s \geq 0$ of dimensions $q_1\times ... \times q_{2 r + s}$. Furthermore, let $(\mat{i}, \mat{j})\in\perm{2 r + s}$ such that $\len{\mat{i}} = 2 r$ and for $k = 1, ..., r$ denote with $\mat{B}_k$ matrices of dimensions $q_{\mat{i}_{k}}\times q_{\mat{i}_{r + k}}$, then + \begin{displaymath} + \t{\ten{A}_{(\mat{i})}}\vec{\bigotimes_{k = r}^{1}}\mat{B}_k + \equiv \ten{R}_{\mat{i}}(\ten{A})\times_{k\in[r]}\t{(\vec{\mat{B}_k})}. + \end{displaymath} +\end{theorem} + +\todo{continue} -Let $\ten{A}$ be a $n\times p_1\times ... \times p_r\times q_1\times ... \times q_r$ tensor and $\mat{B}_k$ be $p_k\times q_k$ matrices, then -\begin{displaymath} - \ten{A}_{(1)} \vec{\bigotimes_{k = r}^{1}\mat{B}_k} - = - \Big(\ten{R}(\ten{A})\times_{\substack{k + 1\\k\in[r]}}\t{\vec(\mat{B}_k)}\Big)_{(1)} -\end{displaymath} -where $\ten{R}$ is a permutation of the axis and reshaping of the tensor $\ten{A}$. This axis permutation converts $n\times p_1\times ... \times p_r\times q_1\times ... \times q_r$ to $n\times p_1\times q_1 \times ... \times p_r\times q_r$ and the reshaping vectorizes the axis pairs $p_k\times q_k$ leading to a tensor $\ten{R}(\ten{A})$ of dimensions $n\times p_1 q_1\times ...\times p_r q_r$. 
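+As a simple sanity check of Theorem~\ref{thm:mtvk_rearrange}, take $r = 1$, $s = 0$ and $\mat{i} = (1, 2)$, so that $\ten{A} \equiv \mat{A}$ is a $q_1\times q_2$ matrix and $\mat{B}_1$ is a matrix of the same dimensions. Then $\ten{A}_{(\mat{i})} = \vec{\mat{A}}$ as well as $\ten{R}_{(1, 2)}(\ten{A}) = \vec{\mat{A}}$, and the statement reduces to the familiar identity
+\begin{displaymath}
+    \t{(\vec{\mat{A}})}\vec{\mat{B}_1} = \t{(\vec{\mat{B}_1})}\vec{\mat{A}} = \tr(\t{\mat{A}}\mat{B}_1).
+\end{displaymath}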
+% Next we define a specific axis permutation and reshaping operation on tensors which will be helpful in expressing some derivatives. Let $\ten{A}$ be a $2 r + s$ tensor with $r > 0$ and $s\geq 0$ of dimensions $p_1\times ... \times p_{2 r + s}$. Furthermore, let $(\mat{i}, \mat{j})\in\perm{2 r + s}$ such that $\len{\mat{i}} = 2 r$. The operation $\ten{R}_{\mat{i}}$ is defined as +% \begin{displaymath} +% \ten{R}_{\mat{i}} = \reshape{(p_1 p_{r + 1}, ..., p_r p_{2 r}, p_{2 r + 1}, ..., p_{r + s})}(\ten{A}_{(\pi(\mat{i}))}) +% \end{displaymath} -An alternative way to write this is for each of the $i\in[n]$ vector components is -\begin{displaymath} - \Big(\ten{A}_{(1)}\vec{\bigotimes_{k = r}^{1}\mat{B}_k}\Big)_{i} - = \sum_{J\in[(\mat{p}, \mat{q})]} - \ten{A}_{i, J}\prod_{k = 1}^r (B_k)_{J_k, J_{k + r}} -\end{displaymath} -using the notation $J\in[(\mat{p}, \mat{q})] = [p_1]\times ... \times [p_r]\times [q_1]\times ... \times [q_r]$. +% The operation $\ten{R}_{\mat{i}}$ is now defined in two steps. First, the axis of $\ten{A}$ are permuted by $(\pi(\mat{i}), \mat{j})$ where $\pi(\mat{i})$ is the perfect outer shuffle of $\mat{i}$. Second, the first $2 r$ axis are collapsed into $r$ axis via reshaping by vectorizing consecutive axis pairs. \todo{Explane, define (understandable)!} Using the operation $\ten{R}_{\mat{i}}$ we get the relation +% \begin{displaymath} +% \t{\ten{A}_{(\mat{i})}}\vec\bigotimes_{k = r}^{1}\mat{B}_k \equiv \ten{R}_{\mat{i}}(\ten{A})\times_{k\in[r]}\vec{\mat{B}_k}. +% \end{displaymath} + + + + + +% Even more general, let $(\mat{i}, \mat{j})\in\perm{r}$ and $\ten{A}$ an order $r$ tensor, then +% \begin{displaymath} +% (\ten{A}\times_{k\in[r]}\mat{B}_k)_{(\mat{i}, \mat{j})} = \Big(\bigotimes_{k = \len{\mat{i}}}^{1}\mat{B}_{\mat{i}_k}\Big) +% \ten{A}_{(\mat{i})}\Big(\bigotimes_{k = \len{\mat{j}}}^{1}\t{\mat{B}_{\mat{j}_k}}\Big). +% \end{displaymath} + +% \begin{displaymath} +% \vec(\ten{A}\times_{k\in[r]}\mat{B}_k) = \Big(\bigotimes_{k = r}^1 \mat{B}_k\Big)\vec\ten{A} +% \end{displaymath} +% \begin{displaymath} +% (\ten{A}\times_{k\in[r]}\mat{B}_k)_{(j)} = \mat{B}_j\ten{A}_{(j)}\bigotimes_{\substack{k = r\\k\neq j}}^1\t{\mat{B}_k} +% \end{displaymath} +% of which a special case is $(\ten{A}\times_{j}\mat{B}_j)_{(j)} = \mat{B}_j\ten{A}_{(j)}$. + +% Even more general, let $(\mat{i}, \mat{j})\in\perm{r}$ and $\ten{A}$ an order $r$ tensor, then +% \begin{displaymath} +% (\ten{A}\times_{k\in[r]}\mat{B}_k)_{(\mat{i}, \mat{j})} = \Big(\bigotimes_{k = \len{\mat{i}}}^{1}\mat{B}_{\mat{i}_k}\Big) +% \ten{A}_{(\mat{i})}\Big(\bigotimes_{k = \len{\mat{j}}}^{1}\t{\mat{B}_{\mat{j}_k}}\Big). +% \end{displaymath} +% For example, let $\ten{A}$ be of order $6$ and $\mat{i} = (1, 3, 2, 4)$, $\mat{j} = (5, 6)$, then +% \begin{displaymath} +% (\ten{A}\times_{k\in[r]}\mat{B}_k)_{((1, 3, 2, 4), (5, 6))} = (\mat{B}_4\otimes\mat{B}_2\otimes\mat{B}_3\otimes\mat{B}_1)\ten{A}_{((1, 3, 2, 4), (5, 6))}(\t{\mat{B}_6}\otimes\t{\mat{B}_5}). +% \end{displaymath} +% where $\ttt$ is the tensor times tensor product. + +% Another useful identity for two tensors $\ten{A}, \ten{B}$ of dimensions $p_1\times ... \times p_{j-1}\times q_j\times p_{j+1}\times p_r$ and $p_1\times ... \times p_{k-1}\times q_k\times p_{k+1}\times p_r$, respectively. 
Furthermore, we have an order $2 r$ tensor $\ten{H}$ of dimensions $(\mat{p}, \mat{p})$ and two matrices $\mat{C}, \mat{D}$ of dimensions $p_j\times q_j$ and $p_k\times q_k$, then +% \begin{align*} +% \t{\vec(\ten{A}\times_j\mat{C})}&\ten{H}_{([r])}\vec(\ten{B}\times_k\mat{D}) \\ +% &= \t{(\vec{\mat{C}})}((\ten{H}\ttt_{[r]\backslash j}\ten{A})\ttt_{([r]\backslash k) + 1, [r]\backslash k}\ten{B})_{(1, 3)}\vec{\mat{D}} \\ +% &= \t{(\vec{\mat{C}})}((\ten{H}\ttt_{([r]\backslash k) + r, [r]\backslash k}\ten{B})\ttt_{[r]\backslash k}\ten{A})_{(1, 4)}\vec{\mat{D}} \\ +% &= \t{(\vec{\mat{C}})} \reshape{(p_j q_j, p_k q_k)}(\reshape{(p_j, \mat{p}, q_j)}(\ten{H}_{(j, [r] + r)}\t{\ten{A}_{(j)}})_{(1, r + 2, k + 1)}\t{\ten{B}_{(k)}}) \vec{\mat{D}} +% \end{align*} + +% Let $\ten{A}$ be an order $r$ tensor and $\mat{B}$ of dimensions $p_j\times q$ a matching matrix such that $\ten{A}\times_j\mat{B}$ is of dimensions $p_1\times...\times p_r$. Furthermore, let $\ten{H}$ be a tensor of dimensions $p_1\times...\times p_r\times d_1\times ... \times d_s$ which makes it an order $r + s$ tensor, then +% \begin{displaymath} +% \t{\vec(\ten{A}\times_j\mat{B})}\ten{H}_{([r])} = \t{(\vec{\mat{B}})}(\ten{H}\ttt_{[r]\backslash j}\ten{A})_{(1, s + 2)}. +% \end{displaymath} + +% Next we define a specific axis permutation and reshaping operation on tensors which will be helpful in expressing some derivatives. Let $\ten{A}$ be a $2 r + s$ tensor $\ten{A}$ with $r > 0$ and $s\geq 0$ of dimensions $p_1\times ... \times p_{2 r + s}$. Furthermore, let $(\mat{i}, \mat{j})\in\perm{2 r + s}$ such that $\len{\mat{i}} = 2 r$, $\len{\mat{j}} = s$ and $\mat{j}$ also sorted. The operation $\ten{R}_{\mat{i}}$ is now defined in two steps. First, the axis of $\ten{A}$ are permuted by $(\pi(\mat{i}), \mat{j})$ where $\pi(\mat{i})$ is the perfect outer shuffle of $\mat{i}$. Second, the first $2 r$ axis are collapsed into $r$ axis via reshaping by vectorizing consecutive axis pairs. \todo{Explane, define (understandable)!} Using the operation $\ten{R}_{\mat{i}}$ we get the relation +% \begin{displaymath} +% \t{\ten{A}_{(\mat{i})}}\vec\bigotimes_{k = r}^{1}\mat{B}_k \equiv \ten{R}_{\mat{i}}(\ten{A})\times_{k\in[r]}\vec{\mat{B}_k}. 
+% \end{displaymath} + +\begin{center} + \begin{tikzpicture}[>=latex, x=1cm, y=-1cm, z=0.5cm, scale = 1.3] + \tikzstyle{axis} = [ + ->, + color = gray, + shorten <=3pt + ]; + \tikzstyle{number} = [ + pos = 0.5, + text = black, + fill = white + ]; + + \begin{scope}[xshift=0cm] + \node[anchor=south] at (1, -0.5) {$\ten{A}_{([4])}$}; + \node[anchor=north, text=gray] at (1, -0.5) {\footnotesize Matrix}; + + \draw[axis] (0, 0) -- +(0, 1) node[number] {$1$}; + \draw[axis] (0, 1) -- +(0, 1) node[number] {$2$}; + \draw[axis] (0, 2) -- +(0, 1) node[number] {$3$}; + \draw[axis] (0, 3) -- +(0, 1) node[number] {$4$}; + \draw[axis] (0, 0) -- +(1, 0) node[number] {$5$}; + \draw[axis] (1, 0) -- +(1, 0) node[number] {$6$}; + + \draw (.1, .1) rectangle (1.9, 3.9); + + \begin{scope}[xshift=-.5cm, yshift=-8cm] + \node[anchor=south] at (1.5, -1.5) {$\ten{R}_{([4])}(\ten{A})$}; + \node[anchor=north, text=gray] at (1.5, -1.5) {\footnotesize 3D Tensor}; + + \draw[axis] (0, 0, 0) -- (0, 1, 0) node[number] {$1$}; + \draw[axis] (0, 1, 0) -- (0, 2, 0) node[number] {$3$}; + \draw[axis] (0, 0, 0) -- (1, 0, 0) node[number] {$2$}; + \draw[axis] (1, 0, 0) -- (2, 0, 0) node[number] {$4$}; + \draw[axis] (0, 0, 0) -- (0, 0, 1) node[number] {$5$}; + \draw[axis] (0, 0, 1) -- (0, 0, 2) node[number] {$6$}; + + \draw (.1, .1) rectangle (1.9, 1.9); + \draw (.1, .1) -- ++(0, 0, 1.8) -- ++(1.8, 0) -- (1.9, .1); + \draw (1.9, .1, 1.8) -- ++(0, 1.8) -- (1.9, 1.9); + \end{scope} + \end{scope} + \begin{scope}[xshift=3cm] + \node[anchor=south] at (1, -0.5) {$\ten{A}_{([6])}$}; + \node[anchor=north, text=gray] at (1, -0.5) {\footnotesize Vector}; + + \draw[axis] (0, 0) -- +(0, 1) node[number] {$1$}; + \draw[axis] (0, 1) -- +(0, 1) node[number] {$2$}; + \draw[axis] (0, 2) -- +(0, 1) node[number] {$3$}; + \draw[axis] (0, 3) -- +(0, 1) node[number] {$4$}; + \draw[axis] (0, 4) -- +(0, 1) node[number] {$5$}; + \draw[axis] (0, 5) -- +(0, 1) node[number] {$6$}; + + \draw (.1, .1) rectangle (.3, 5.9); + + \begin{scope}[xshift=-.5cm, yshift=-8cm] + \node[anchor=south] at (1.5, -1.5) {$\ten{R}_{([6])}(\ten{A})$}; + \node[anchor=north, text=gray] at (1.5, -1.5) {\footnotesize 3D Tensor}; + + \draw[axis] (0, 0, 0) -- (0, 1, 0) node[number] {$1$}; + \draw[axis] (0, 1, 0) -- (0, 2, 0) node[number] {$4$}; + \draw[axis] (0, 0, 0) -- (1, 0, 0) node[number] {$2$}; + \draw[axis] (1, 0, 0) -- (2, 0, 0) node[number] {$5$}; + \draw[axis] (0, 0, 0) -- (0, 0, 1) node[number] {$3$}; + \draw[axis] (0, 0, 1) -- (0, 0, 2) node[number] {$6$}; + + \draw (.1, .1) rectangle (1.9, 1.9); + \draw (.1, .1) -- ++(0, 0, 1.8) -- ++(1.8, 0) -- (1.9, .1); + \draw (1.9, .1, 1.8) -- ++(0, 1.8) -- (1.9, 1.9); + \end{scope} + \end{scope} + \begin{scope}[xshift=6cm] + \node[anchor=south] at (1, -0.5) {$\ten{A}_{((1, 5, 2, 6))}$}; + \node[anchor=north, text=gray] at (1, -0.5) {\footnotesize Matrix}; + + \draw[axis] (0, 0) -- +(0, 1) node[number] {$1$}; + \draw[axis] (0, 1) -- +(0, 1) node[number] {$5$}; + \draw[axis] (0, 2) -- +(0, 1) node[number] {$2$}; + \draw[axis] (0, 3) -- +(0, 1) node[number] {$6$}; + \draw[axis] (0, 0) -- +(1, 0) node[number] {$3$}; + \draw[axis] (1, 0) -- +(1, 0) node[number] {$4$}; + + \draw (.1, .1) rectangle (1.9, 3.9); + + \begin{scope}[xshift=-.5cm, yshift=-8cm] + \node[anchor=south] at (1.5, -1.5) {$\ten{R}_{((1, 5, 2, 6))}(\ten{A})$}; + \node[anchor=north, text=gray] at (1.5, -1.5) {\footnotesize 3D Tensor}; + + \draw[axis] (0, 0, 0) -- (0, 1, 0) node[number] {$1$}; + \draw[axis] (0, 1, 0) -- (0, 2, 0) node[number] {$2$}; + \draw[axis] (0, 0, 0) -- (1, 
0, 0) node[number] {$5$}; + \draw[axis] (1, 0, 0) -- (2, 0, 0) node[number] {$6$}; + \draw[axis] (0, 0, 0) -- (0, 0, 1) node[number] {$3$}; + \draw[axis] (0, 0, 1) -- (0, 0, 2) node[number] {$4$}; + + \draw (.1, .1) rectangle (1.9, 1.9); + \draw (.1, .1) -- ++(0, 0, 1.8) -- ++(1.8, 0) -- (1.9, .1); + \draw (1.9, .1, 1.8) -- ++(0, 1.8) -- (1.9, 1.9); + \end{scope} + \end{scope} + \end{tikzpicture} +\end{center} + +\newcommand{\somedrawing} { + \coordinate (a) at (-2,-2,-2); + \coordinate (b) at (-2,-2,2); + \coordinate (c) at (-2,2,-2); + \coordinate (d) at (-2,2,2); + \coordinate (e) at (2,-2,-2); + \coordinate (f) at (2,-2,2); + \coordinate (g) at (2,2,-2); + \coordinate (h) at (2,2,2); + \draw (a)--(b) (a)--(c) (a)--(e) (b)--(d) (b)--(f) (c)--(d) (c)--(g) (d)--(h) (e)--(f) (e)--(g) (f)--(h) (g)--(h); + \fill (a) circle (0.1cm); + \fill (d) ++(0.1cm,0.1cm) rectangle ++(-0.2cm,-0.2cm); +} + +% \begin{figure}[ht!] +% \centering +% \begin{tikzpicture}[scale = 2, x = 1cm, y = 1cm, z = 0.5cm] +% \begin{scope} +% \foreach \y in {8,...,5} { +% \draw[canvas is xz plane at y={-\y / 5}, fill = gray!30] +% (0, 0) node[anchor = east] {$\y$} rectangle (1, 1) +% node[pos = 0.5, transform shape, scale = 2] {$\y$}; +% } +% \foreach \y in {4,...,1} { +% \draw[canvas is xz plane at y={-\y / 5}, fill = white] +% (0, 0) node[anchor = east] {$\y$} rectangle (1, 1) +% node[pos = 0.5, transform shape, scale = 2] {$\y$}; +% } +% \end{scope} + +% \begin{scope}[xshift = 3cm] +% \draw[canvas is xz plane at y={-8 / 5}, fill = gray!30] (0, 0) node[anchor = east] {$8$} rectangle (1, 1) node[pos = 0.5, transform shape, scale = 2] {$8$}; +% \draw[canvas is xz plane at y={-7 / 5}, fill = white] (0, 0) node[anchor = east] {$4$} rectangle (1, 1) node[pos = 0.5, transform shape, scale = 2] {$4$}; +% \draw[canvas is xz plane at y={-6 / 5}, fill = gray!30] (0, 0) node[anchor = east] {$7$} rectangle (1, 1) node[pos = 0.5, transform shape, scale = 2] {$7$}; +% \draw[canvas is xz plane at y={-5 / 5}, fill = white] (0, 0) node[anchor = east] {$3$} rectangle (1, 1) node[pos = 0.5, transform shape, scale = 2] {$3$}; +% \draw[canvas is xz plane at y={-4 / 5}, fill = gray!30] (0, 0) node[anchor = east] {$6$} rectangle (1, 1) node[pos = 0.5, transform shape, scale = 2] {$6$}; +% \draw[canvas is xz plane at y={-3 / 5}, fill = white] (0, 0) node[anchor = east] {$2$} rectangle (1, 1) node[pos = 0.5, transform shape, scale = 2] {$2$}; +% \draw[canvas is xz plane at y={-2 / 5}, fill = gray!30] (0, 0) node[anchor = east] {$5$} rectangle (1, 1) node[pos = 0.5, transform shape, scale = 2] {$5$}; +% \draw[canvas is xz plane at y={-1 / 5}, fill = white] (0, 0) node[anchor = east] {$1$} rectangle (1, 1) node[pos = 0.5, transform shape, scale = 2] {$1$}; +% \end{scope} +% \end{tikzpicture} +% \caption{\label{fig:perfect_outer_shuffle}Left: sorted deck of $8$ cards, Right: perfect outer shuffle of the deck on the left.} +% \end{figure} + +% Some text with $\heartsuit$, $\spadesuit$, $\diamondsuit$, $\clubsuit$ in it +\newcommand{\hearts}[1]{% + \begin{scope}[every node/.style={transform shape, text = red}] + \draw[rounded corners=4pt, fill = white] (1.6, 0) rectangle (0, 2) + node[pos = 0.5, scale = 3] {#1}; + \node[scale = 1, rotate=180] at (1.4, 0.2) {$#1$}; + \node[scale = 1, rotate=180] at (1.4, 0.5) {$\heartsuit$}; + \node[scale = 1] at (0.2, 1.8) {$#1$}; + \node[scale = 1] at (0.2, 1.5) {$\heartsuit$}; + \end{scope} +} +\newcommand{\spades}[1]{% + \begin{scope}[every node/.style={transform shape, text = black}] + \draw[rounded corners=4pt, 
fill = white] (1.6, 0) rectangle (0, 2) + node[pos = 0.5, scale = 3] {#1}; + \node[scale = 1, rotate=180] at (1.4, 0.2) {$#1$}; + \node[scale = 1, rotate=180] at (1.4, 0.5) {$\spadesuit$}; + \node[scale = 1] at (0.2, 1.8) {$#1$}; + \node[scale = 1] at (0.2, 1.5) {$\spadesuit$}; + \end{scope} +} + +\begin{figure}[ht!] + \centering + \begin{tikzpicture}[>=latex] + % \draw[->] (0, 0, 0) -- (3, 0, 0) node[anchor = west] {$x$}; + % \draw[->] (0, 0, 0) -- (0, 3, 0) node[anchor = west] {$y$}; + % \draw[->] (0, 0, 0) -- (0, 0, 3) node[anchor = west] {$z$}; + + % \coordinate (op) at (2, 0, 0); + % \coordinate (xp) at (1.6, 0, -2); + % \coordinate (yp) at (3, 0, 0.5); + + % \draw[->, red] (0, 0, 0) -- (op) node[anchor = south east] {$o_{p}$}; + % \draw[->, red] (0, 0, 0) -- (xp) node[anchor = south east] {$x_{p}$}; + % \draw[->, red] (0, 0, 0) -- (yp) node[anchor = north east] {$y_{p}$}; + + % \begin{scope}[canvas is plane={O(2, 0, 0)x(1.6, 0, -2)y(3, 0, 0.5)}] + % \draw[->, blue] (0, 0) -- (1, 0) node[anchor = south west] {$x_{i}$}; + % \draw[->, blue] (0, 0) -- (0, 1) node[anchor = north west] {$y_{i}$}; + % \end{scope} + + \begin{scope} + \node at ({3 / 2 - 0.2}, 6) {Identity}; + \foreach \t [count=\f] in {1, 2, 3, 4, 5, 6, 7, 8} { + \node (from) at ({\f / 3 - 0.2}, 5.4) {$\f$}; + \node (to) at ({\t / 3 - 0.2}, 4.2) {$\f$}; + \draw[->, gray] (from.south) to[out=270, in=90] (to.north); + } + + \foreach \card [count=\y, evaluate=\y as \h using {\y / 3.5}] in {4,...,2,A} { + \begin{scope}[canvas is plane={O(0, \h, 0)x(1, \h, 0.5)y(-0.2, \h, -2)}] + \spades{\card}; + \end{scope} + } + \foreach \card [count=\y, evaluate=\y as \h using {(\y + 4) / 3.5}] in {4,...,2,A} { + \begin{scope}[canvas is plane={O(0, \h, 0)x(1, \h, 0.5)y(-0.2, \h, -2)}] + \hearts{\card}; + \end{scope} + } + \end{scope} + + \begin{scope}[xshift = 3.5cm] + \node at ({3 / 2 - 0.2}, 6) {Perfect Shuffle}; + \foreach \t [count=\f] in {1, 5, 2, 6, 3, 7, 4, 8} { + \node (from) at ({\f / 3 - 0.2}, 5.4) {$\f$}; + \node (to) at ({\t / 3 - 0.2}, 4.2) {$\f$}; + \draw[->, gray] (from.south) to[out=270, in=90] (to.north); + } + + \begin{scope}[canvas is plane={O(0, {1 / 3.5}, 0)x(1, {1 / 3.5}, 0.5)y(-0.2, {1 / 3.5}, -2)}] + \spades{4}; + \end{scope} + \begin{scope}[canvas is plane={O(0, {2 / 3.5}, 0)x(1, {2 / 3.5}, 0.5)y(-0.2, {2 / 3.5}, -2)}] + \hearts{4}; + \end{scope} + \begin{scope}[canvas is plane={O(0, {3 / 3.5}, 0)x(1, {3 / 3.5}, 0.5)y(-0.2, {3 / 3.5}, -2)}] + \spades{3}; + \end{scope} + \begin{scope}[canvas is plane={O(0, {4 / 3.5}, 0)x(1, {4 / 3.5}, 0.5)y(-0.2, {4 / 3.5}, -2)}] + \hearts{3}; + \end{scope} + \begin{scope}[canvas is plane={O(0, {5 / 3.5}, 0)x(1, {5 / 3.5}, 0.5)y(-0.2, {5 / 3.5}, -2)}] + \spades{2}; + \end{scope} + \begin{scope}[canvas is plane={O(0, {6 / 3.5}, 0)x(1, {6 / 3.5}, 0.5)y(-0.2, {6 / 3.5}, -2)}] + \hearts{2}; + \end{scope} + \begin{scope}[canvas is plane={O(0, {7 / 3.5}, 0)x(1, {7 / 3.5}, 0.5)y(-0.2, {7 / 3.5}, -2)}] + \spades{A}; + \end{scope} + \begin{scope}[canvas is plane={O(0, {8 / 3.5}, 0)x(1, {8 / 3.5}, 0.5)y(-0.2, {8 / 3.5}, -2)}] + \hearts{A}; + \end{scope} + \end{scope} + + \begin{scope}[xshift = 7cm] + \node at ({3 / 2 - 0.2}, 6) {Inverse Shuffle}; + \foreach \t [count=\f] in {1, 3, 5, 7, 2, 4, 6, 8} { + \node (from) at ({\f / 3 - 0.2}, 5.4) {$\f$}; + \node (to) at ({\t / 3 - 0.2}, 4.2) {$\f$}; + \draw[->, gray] (from.south) to[out=270, in=90] (to.north); + } + + \begin{scope}[canvas is plane={O(0, {1 / 3.5}, 0)x(1, {1 / 3.5}, 0.5)y(-0.2, {1 / 3.5}, -2)}] + \spades{4}; + \end{scope} + 
\begin{scope}[canvas is plane={O(0, {2 / 3.5}, 0)x(1, {2 / 3.5}, 0.5)y(-0.2, {2 / 3.5}, -2)}] + \spades{2}; + \end{scope} + \begin{scope}[canvas is plane={O(0, {3 / 3.5}, 0)x(1, {3 / 3.5}, 0.5)y(-0.2, {3 / 3.5}, -2)}] + \hearts{4}; + \end{scope} + \begin{scope}[canvas is plane={O(0, {4 / 3.5}, 0)x(1, {4 / 3.5}, 0.5)y(-0.2, {4 / 3.5}, -2)}] + \hearts{2}; + \end{scope} + \begin{scope}[canvas is plane={O(0, {5 / 3.5}, 0)x(1, {5 / 3.5}, 0.5)y(-0.2, {5 / 3.5}, -2)}] + \spades{3}; + \end{scope} + \begin{scope}[canvas is plane={O(0, {6 / 3.5}, 0)x(1, {6 / 3.5}, 0.5)y(-0.2, {6 / 3.5}, -2)}] + \spades{A}; + \end{scope} + \begin{scope}[canvas is plane={O(0, {7 / 3.5}, 0)x(1, {7 / 3.5}, 0.5)y(-0.2, {7 / 3.5}, -2)}] + \hearts{3}; + \end{scope} + \begin{scope}[canvas is plane={O(0, {8 / 3.5}, 0)x(1, {8 / 3.5}, 0.5)y(-0.2, {8 / 3.5}, -2)}] + \hearts{A}; + \end{scope} + \end{scope} + \end{tikzpicture} + \caption{\label{fig:perfect_outer_shuffle}Left: sorted deck of $8$ cards, Center: perfect outer shuffle, Right: inverse perfect outer shuffle} +\end{figure} + + + +% Let $\ten{A}$ be a $p_1\times ... \times p_r\times q_1\times ... \times q_r\times d_1\times ...\times d_s$ tensor and $\mat{B}_k$ be $p_k\times q_k$ matrices, then +% \begin{displaymath} +% \t{\Big(\vec{\bigotimes_{k = r}^{1}\mat{B}_k}\Big)}\ten{A}_{([2 r])} +% = \t{(\vec{\mat{B}_k})}(\operatorname{\mathcal{R}}(\ten{A})\times_{k\in[r]\backslash j}\t{(\vec{\mat{B}_k})})_{([2 r])} +% \end{displaymath} + +% Let $\ten{A}$ be a $n\times p_1\times ... \times p_r\times q_1\times ... \times q_r$ tensor and $\mat{B}_k$ be $p_k\times q_k$ matrices, then +% \begin{displaymath} +% \ten{A}_{(1)} \vec{\bigotimes_{k = r}^{1}\mat{B}_k} +% = +% \Big(\ten{R}(\ten{A})\times_{\substack{k + 1\\k\in[r]}}\t{\vec(\mat{B}_k)}\Big)_{(1)} +% \end{displaymath} +% where $\ten{R}$ is a permutation of the axis and reshaping of the tensor $\ten{A}$. This axis permutation converts $n\times p_1\times ... \times p_r\times q_1\times ... \times q_r$ to $n\times p_1\times q_1 \times ... \times p_r\times q_r$ and the reshaping vectorizes the axis pairs $p_k\times q_k$ leading to a tensor $\ten{R}(\ten{A})$ of dimensions $n\times p_1 q_1\times ...\times p_r q_r$. + +% An alternative way to write this is for each of the $i\in[n]$ vector components is +% \begin{displaymath} +% \Big(\ten{A}_{(1)}\vec{\bigotimes_{k = r}^{1}\mat{B}_k}\Big)_{i} +% = \sum_{J\in[(\mat{p}, \mat{q})]} +% \ten{A}_{i, J}\prod_{k = 1}^r (B_k)_{J_k, J_{k + r}} +% \end{displaymath} +% using the notation $J\in[(\mat{p}, \mat{q})] = [p_1]\times ... \times [p_r]\times [q_1]\times ... \times [q_r]$. 
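+Theorem~\ref{thm:mtvk_rearrange} is precisely the identity used in the computation of the observed information above. For example, with $\ten{A} = \ten{H}_{2,1}$, $\mat{i} = [2r]$ and $\mat{B}_k = \mat{\Omega}_k$ it yields
+\begin{displaymath}
+    \t{(\ten{H}_{2,1})_{([2r])}}\vec\Big(\bigotimes_{k = r}^{1}\mat{\Omega}_k\Big)
+    = \vec\Big(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\in[r]}\t{(\vec{\mat{\Omega}_k})}\Big),
+\end{displaymath}
+which is the rewriting step used for the mixed $\overline{\ten{\eta}}_1$, $\mat{\Omega}_j$ second differentials.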
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Pattern Matrices} @@ -491,11 +1115,17 @@ For two matrices $\mat A$ of dimensions $a_1\times a_2$ and $\mat B$ of dimensio \t{\mat{K}_{\mat{p},(j)}} &= \mat{K}_{\mat{p},(j)}^{-1} \end{align*} +\section{Further Identities} +\begin{displaymath} + \tr(\mat{A} \mat{B} \mat{C} \mat{D}) = \t{(\vec{\t{\mat{B}}})}(\t{\mat{A}}\otimes \mat{C})\vec{\mat{D}} +\end{displaymath} + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Matrix Calculus} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + \begin{example} We want to find the derivative with respect to any of the $r$ symmetric $p_j\times p_j$ matrices $\mat{\Omega}_j$ where $j = 1, ..., r$ of the Kronecker product \begin{displaymath} @@ -511,21 +1141,21 @@ For two matrices $\mat A$ of dimensions $a_1\times a_2$ and $\mat B$ of dimensio \d\mat{F} &= \d\bigotimes_{k = r}^1 \mat{\Omega}_k = \sum_{j = 1}^r \bigotimes_{k = r}^{j+1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k = j - 1}^{1}\mat{\Omega}_k = \sum_{j = 1}^r \overline{\mat{\Omega}}_j\otimes\d\mat{\Omega}_j\otimes\underline{\mat{\Omega}}_j \\ - &= \sum_{j = 1}^r \mat{K}_{\overline{p}_jp_j,\underline{p}_j}(\underline{\mat{\Omega}}_j\otimes\overline{\mat{\Omega}}_j\otimes\d\mat{\Omega}_j)\mat{K}_{\underline{p}_j,\overline{p}_jp_j} + &= \sum_{j = 1}^r \mat{K}_{\overline{p}_j p_j,\underline{p}_j}(\underline{\mat{\Omega}}_j\otimes\overline{\mat{\Omega}}_j\otimes\d\mat{\Omega}_j)\mat{K}_{\underline{p}_j,\overline{p}_j p_j} \end{align*} By vectorizing this transforms to \begin{align*} - \d\vec\mat{F} &= \sum_{j = 1}^r (\mat{K}_{\overline{p}_jp_j,\underline{p}_j}\otimes\mat{K}_{\overline{p}_jp_j,\underline{p}_j})\vec(\underline{\mat{\Omega}}_j\otimes\overline{\mat{\Omega}}_j\otimes\d\mat{\Omega}_j) \\ - &= \sum_{j = 1}^r (\mat{K}_{\overline{p}_jp_j,\underline{p}_j}\otimes\mat{K}_{\overline{p}_jp_j,\underline{p}_j})(\mat{I}_{\overline{p}_j\underline{p}_j}\otimes\mat{K}_{p_j,\overline{p}_j\underline{p}_j}\otimes\mat{I}_{p_j})(\vec(\underline{\mat{\Omega}}_j\otimes\overline{\mat{\Omega}}_j)\otimes\d\vec\mat{\Omega}_j) \\ - &= \sum_{j = 1}^r (\mat{K}_{\overline{p}_jp_j,\underline{p}_j}\otimes\mat{K}_{\overline{p}_jp_j,\underline{p}_j})(\mat{I}_{\overline{p}_j\underline{p}_j}\otimes\mat{K}_{p_j,\overline{p}_j\underline{p}_j}\otimes\mat{I}_{p_j})(\vec(\underline{\mat{\Omega}}_j\otimes\overline{\mat{\Omega}}_j)\otimes\mat{I}_{p_j^2})\,\d\vec\mat{\Omega}_j \\ + \d\vec\mat{F} &= \sum_{j = 1}^r (\mat{K}_{\overline{p}_j p_j,\underline{p}_j}\otimes\mat{K}_{\overline{p}_jp_j,\underline{p}_j})\vec(\underline{\mat{\Omega}}_j\otimes\overline{\mat{\Omega}}_j\otimes\d\mat{\Omega}_j) \\ + &= \sum_{j = 1}^r (\mat{K}_{\overline{p}_j p_j,\underline{p}_j}\otimes\mat{K}_{\overline{p}_jp_j,\underline{p}_j})(\mat{I}_{\overline{p}_j\underline{p}_j}\otimes\mat{K}_{p_j,\overline{p}_j\underline{p}_j}\otimes\mat{I}_{p_j})(\vec(\underline{\mat{\Omega}}_j\otimes\overline{\mat{\Omega}}_j)\otimes\d\vec\mat{\Omega}_j) \\ + &= \sum_{j = 1}^r (\mat{K}_{\overline{p}_j p_j,\underline{p}_j}\otimes\mat{K}_{\overline{p}_jp_j,\underline{p}_j})(\mat{I}_{\overline{p}_j\underline{p}_j}\otimes\mat{K}_{p_j,\overline{p}_j\underline{p}_j}\otimes\mat{I}_{p_j})(\vec(\underline{\mat{\Omega}}_j\otimes\overline{\mat{\Omega}}_j)\otimes\mat{I}_{p_j^2})\,\d\vec\mat{\Omega}_j \\ \end{align*} leading to \begin{displaymath} - \D\mat{F}(\mat{\Omega}_j) = 
(\mat{K}_{\overline{p}_jp_j,\underline{p}_j}\otimes\mat{K}_{\overline{p}_jp_j,\underline{p}_j})(\mat{I}_{\overline{p}_j\underline{p}_j}\otimes\mat{K}_{p_j,\overline{p}_j\underline{p}_j}\otimes\mat{I}_{p_j})(\vec(\underline{\mat{\Omega}}_j\otimes\overline{\mat{\Omega}}_j)\otimes\mat{I}_{p_j^2})
+    \D\mat{F}(\mat{\Omega}_j) = (\mat{K}_{\overline{p}_j p_j,\underline{p}_j}\otimes\mat{K}_{\overline{p}_jp_j,\underline{p}_j})(\mat{I}_{\overline{p}_j\underline{p}_j}\otimes\mat{K}_{p_j,\overline{p}_j\underline{p}_j}\otimes\mat{I}_{p_j})(\vec(\underline{\mat{\Omega}}_j\otimes\overline{\mat{\Omega}}_j)\otimes\mat{I}_{p_j^2})
 \end{displaymath}
 for each $j = 1, ..., r$. Note that the $p^2\times p^2$ matrices
 \begin{displaymath}
-    \mat{P}_j = (\mat{K}_{\overline{p}_jp_j,\underline{p}_j}\otimes\mat{K}_{\overline{p}_jp_j,\underline{p}_j})(\mat{I}_{\overline{p}_j\underline{p}_j}\otimes\mat{K}_{p_j,\overline{p}_j\underline{p}_j}\otimes\mat{I}_{p_j})
+    \mat{P}_j = (\mat{K}_{\overline{p}_j p_j,\underline{p}_j}\otimes\mat{K}_{\overline{p}_jp_j,\underline{p}_j})(\mat{I}_{\overline{p}_j\underline{p}_j}\otimes\mat{K}_{p_j,\overline{p}_j\underline{p}_j}\otimes\mat{I}_{p_j})
 \end{displaymath}
 are permutations.
 \end{example}
@@ -545,12 +1175,42 @@ Let $X, Y$ be $p, q$ dimensional random variables, respectively. Furthermore, le
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \section{Proofs}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
 \begin{proof}[Proof of Theorem~\ref{thm:sdr}]
-
-
-
+    By Theorem~1 in \cite{sdr-BuraDuarteForzani2016} we have that
+    \begin{displaymath}
+        \mat{R}(\ten{X}) = \t{\mat{B}}(\mat{t}(\ten{X}) - \E\mat{t}(\ten{X}))
+    \end{displaymath}
+    is a sufficient reduction under the exponential family \eqref{eq:exp-family} where $\mat{B}\in\mathbb{R}^{p(p + 1)\times q}$ with $\Span(\mat{B}) = \Span(\{\mat{\eta}_Y - \E_{Y}\mat{\eta}_Y : Y\in\mathcal{S}_Y\})$. Because $\E_Y\ten{F}_Y = 0$ we have $\E_Y\mat{\eta}_{Y,1} \equiv c_1\E[\overline{\ten{\eta}}_1 - \ten{F}_Y\times_{k\in[r]}\mat{\alpha}_k] = c_1 \overline{\ten{\eta}}_1$, and since $\mat{\eta}_{y,2}$ does not depend on $y$ (despite the notation) we get
+    \begin{displaymath}
+        \mat{\eta}_Y - \E_{Y}\mat{\eta}_Y = \begin{pmatrix}
+            \mat{\eta}_{Y,1} - \E_{Y}\mat{\eta}_{Y,1} \\
+            \mat{\eta}_{Y,2} - \E_{Y}\mat{\eta}_{Y,2}
+        \end{pmatrix} = \begin{pmatrix}
+            c_1\vec(\ten{F}_Y\times_{k\in[r]}\mat{\alpha}_k) \\
+            \mat{0}
+        \end{pmatrix}.
+    \end{displaymath}
+    Noting that
+    \begin{displaymath}
+        c_1\vec(\ten{F}_Y\times_{k\in[r]}\mat{\alpha}_k)
+        = c_1\Big( \bigotimes_{k = r}^{1}\mat{\alpha}_k \Big)\vec(\ten{F}_Y)
+    \end{displaymath}
+    we get
+    \begin{displaymath}
+        \mat{B} = \begin{pmatrix}
+            c_1 \bigotimes_{k = r}^{1}\mat{\alpha}_k \\
+            \mat{0}_{p^2\times q}
+        \end{pmatrix}.
+    \end{displaymath}
+    Simplifying leads to
+    \begin{displaymath}
+        \t{\mat{B}}(\mat{t}(\ten{X}) - \E\mat{t}(\ten{X})) = c_1 \t{\Big( \bigotimes_{k = r}^{1}\mat{\alpha}_k \Big)}(\mat{t}_1(\ten{X}) - \E\mat{t}_1(\ten{X})).
+    \end{displaymath}
+    Now note that $\Span(\mat{A}) = \Span(c \mat{A})$ for any matrix $\mat{A}$ and non-zero scalar $c$, and recall the definition $\mat{t}_1(\ten{X}) = \vec{\ten{X}}$, which concludes the proof.
\end{proof} -Illustration of dimensions + +Before we start the prove of Theorem~\ref{thm:grad} an illustration of the different dimensions in one of the gradients; \begin{displaymath} \underbrace{ \mat{D}_{p_j}\t{\mat{D}_{p_j}} }_{\makebox[0pt]{\scriptsize $p_j^2\times p_j^2$}} % @@ -565,6 +1225,9 @@ Illustration of dimensions }_{\makebox[0pt]{\scriptsize $(p/p_j)^2\times 1$}} \end{displaymath} +\begin{proof}[Proof of Theorem~\ref{thm:grad}] + \todo{THIS} +\end{proof} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Distributions} @@ -603,7 +1266,7 @@ where $p = \prod_{i = 1}^r p_i$. This is equivalent to the vectorized $\vec\ten{ \langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle = \t{\vec(\ten{X} - \mu)}\vec((\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\}) \\ = \t{\vec(\ten{X} - \mu)}(\mat{\Delta}_r^{-1}\otimes ...\otimes\mat{\Delta}_1^{-1})\vec(\ten{X} - \mu) \\ - = \t{(\vec\ten{X} - \vec\mu)}(\mat{\Delta}_r\otimes ...\otimes\mat{\Delta}_1)^{-1}(\vec\ten{X} - \vec\mu). + = \t{(\vec{\ten{X}} - \vec{\mu})}(\mat{\Delta}_r\otimes ...\otimes\mat{\Delta}_1)^{-1}(\vec\ten{X} - \vec\mu). \end{multline*} Next, using a property of the determinant of a Kronecker product $|\mat{\Delta}_1\otimes\mat{\Delta}_2| = |\mat{\Delta}_1|^{p_2}|\mat{\Delta}_2|^{p_1}$ yields \begin{displaymath} diff --git a/tensorPredictors/NAMESPACE b/tensorPredictors/NAMESPACE index 751dcd0..efe36de 100644 --- a/tensorPredictors/NAMESPACE +++ b/tensorPredictors/NAMESPACE @@ -47,11 +47,13 @@ export(mkm) export(mlm) export(mtvk) export(num.deriv) +export(num.deriv2) export(reduce) export(rowKronecker) export(rtensornorm) export(tensor_predictor) export(ttm) +export(ttt) export(vech) export(vech.index) export(vech.pinv) diff --git a/tensorPredictors/R/GMLM.R b/tensorPredictors/R/GMLM.R index f057c89..7a442bf 100644 --- a/tensorPredictors/R/GMLM.R +++ b/tensorPredictors/R/GMLM.R @@ -32,10 +32,12 @@ make.gmlm.family <- function(name) { # Extract main mode covariance directions # Note: (the directions are transposed!) XDirs <- Map(function(Sigma) { - with(La.svd(Sigma, nu = 0), sqrt(d) * vt) + SVD <- La.svd(Sigma, nu = 0) + sqrt(SVD$d) * SVD$vt }, XSigmas) YDirs <- Map(function(Sigma) { - with(La.svd(Sigma, nu = 0), sqrt(d) * vt) + SVD <- La.svd(Sigma, nu = 0) + sqrt(SVD$d) * SVD$vt }, YSigmas) alphas <- Map(function(xdir, ydir) { @@ -80,7 +82,6 @@ make.gmlm.family <- function(name) { # retrieve dimensions n <- tail(dim(X), 1) # sample size p <- head(dim(X), -1) # predictor dimensions - q <- head(dim(Fy), -1) # response dimensions r <- length(p) # single predictor/response tensor order ## "Inverse" Link: Tensor Normal Specific @@ -139,10 +140,12 @@ make.gmlm.family <- function(name) { # Extract main mode covariance directions # Note: (the directions are transposed!) XDirs <- Map(function(Sigma) { - with(La.svd(Sigma, nu = 0), sqrt(d) * vt) + SVD <- La.svd(Sigma, nu = 0) + sqrt(SVD$d) * SVD$vt }, XSigmas) YDirs <- Map(function(Sigma) { - with(La.svd(Sigma, nu = 0), sqrt(d) * vt) + SVD <- La.svd(Sigma, nu = 0) + sqrt(SVD$d) * SVD$vt }, YSigmas) alphas <- Map(function(xdir, ydir) { @@ -158,42 +161,6 @@ make.gmlm.family <- function(name) { ) } - # initialize <- function(X, Fy) { - # r <- length(dim(X)) - 1L - - # # Mode-Covariances - # XSigmas <- mcov(X, sample.axis = r + 1L) - # YSigmas <- mcov(Fy, sample.axis = r + 1L) - - # # Extract main mode covariance directions - # # Note: (the directions are transposed!) 
- # XDirs <- Map(function(Sigma) { - # with(La.svd(Sigma, nu = 0), sqrt(d) * vt) - # }, XSigmas) - # YDirs <- Map(function(Sigma) { - # with(La.svd(Sigma, nu = 0), sqrt(d) * vt) - # }, YSigmas) - - # alphas <- Map(function(xdir, ydir) { - # s <- min(ncol(xdir), nrow(ydir)) - # crossprod(xdir[seq_len(s), , drop = FALSE], - # ydir[seq_len(s), , drop = FALSE]) - # }, XDirs, YDirs) - - # # Scatter matrices from Residuals (intercept not considered) - # Deltas <- mcov(X - mlm(Fy, alphas), sample.axis = r + 1L) - # Omegas <- Map(solve, Deltas) - - # # and the intercept - # eta1 <- mlm(rowMeans(X, dims = r), Deltas) - - # list( - # eta1 = eta1, - # alphas = alphas, - # Omegas = Omegas - # ) - # } - params <- function(Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) { # number of observations n <- tail(dim(Fy), 1) @@ -243,7 +210,6 @@ make.gmlm.family <- function(name) { # retrieve dimensions n <- tail(dim(X), 1) # sample size p <- head(dim(X), -1) # predictor dimensions - q <- head(dim(Fy), -1) # response dimensions r <- length(p) # single predictor/response tensor order ## "Inverse" Link: Ising Model Specific diff --git a/tensorPredictors/R/num_deriv.R b/tensorPredictors/R/num_deriv.R index b5a513e..def2d28 100644 --- a/tensorPredictors/R/num_deriv.R +++ b/tensorPredictors/R/num_deriv.R @@ -19,3 +19,18 @@ num.deriv <- function(F, X, h = 1e-6, sym = FALSE) { }) } } + +#' Second numeric derivative +#' +#' @export +num.deriv2 <- function(F, X, Y, h = 1e-6, symX = FALSE, symY = FALSE) { + if (missing(Y)) { + num.deriv(function(x) { + num.deriv(function(z) F(z), x, h = h, sym = symX) + }, X, h = h, sym = symX) + } else { + num.deriv(function(y) { + num.deriv(function(x) F(x, y), X, h = h, sym = symX) + }, Y, h = h, sym = symY) + } +} diff --git a/tensorPredictors/R/ttm.R b/tensorPredictors/R/ttm.R index ce7bdc8..a2c570d 100644 --- a/tensorPredictors/R/ttm.R +++ b/tensorPredictors/R/ttm.R @@ -14,6 +14,7 @@ #' @export ttm <- function(T, M, mode = length(dim(T)), transposed = FALSE) { storage.mode(T) <- storage.mode(M) <- "double" + dim(M) <- c(NROW(M), NCOL(M)) .Call("C_ttm", T, M, as.integer(mode), as.logical(transposed)) } diff --git a/tensorPredictors/R/ttt.R b/tensorPredictors/R/ttt.R new file mode 100644 index 0000000..b5b0586 --- /dev/null +++ b/tensorPredictors/R/ttt.R @@ -0,0 +1,14 @@ +#' Tensor Times Tensor +#' +#' @examples +#' A <- array(rnorm(3 * 7 * 11 * 17), dim = c(3, 7, 11, 17)) +#' B <- array(rnorm(17 * 2 * 11 * 5 * 7), dim = c(17, 2, 11, 5, 7)) +#' +#' ttt(A, B, 2:4, c(5, 3, 1)) +#' +#' @export +ttt <- function(A, B, modesA, modesB = modesA, dimsA = dim(A), dimsB = dim(B)) { + R <- crossprod(mat(A, modesA, dimsA), mat(B, modesB, dimsB)) + dim(R) <- c(dim(A)[-modesA], dim(B)[-modesB]) + R +}