\documentclass[a4paper, 12pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{fullpage}
\usepackage{amsmath, amssymb, amstext, amsthm}
\usepackage{bm} % \boldsymbol and italic corrections, ...
\usepackage[pdftex]{hyperref}
\usepackage{makeidx} % Index (Symbols, Names, ...)
\usepackage{xcolor, graphicx} % colors and including images
\usepackage{tikz}
\usetikzlibrary{calc}
\usepackage[
% backend=bibtex,
style=authoryear-comp
]{biblatex}
\usepackage{algorithm, algpseudocode} % Pseudo Codes / Algorithms
% Document meta info
\title{Generalized Multi-Linear Model for the Quadratic Exponential Family}
\author{Daniel Kapla}
\date{\today}
% Set PDF title, author and creator.
\AtBeginDocument{
\hypersetup{
pdftitle = {Generalized Multi-Linear Model for the Quadratic Exponential Family},
pdfauthor = {Daniel Kapla},
pdfcreator = {\pdftexbanner}
}
}
\makeindex
% Bibliography resource(s)
\addbibresource{main.bib}
% Setup environments
% Theorem, Lemma
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{example}{Example}
% Definition
\theoremstyle{definition}
\newtheorem{defn}{Definition}
% Remark
\theoremstyle{remark}
\newtheorem{remark}{Remark}
% Define math macros
\newcommand{\mat}[1]{\boldsymbol{#1}}
\newcommand{\ten}[1]{\mathcal{#1}}
\renewcommand{\vec}{\operatorname{vec}}
\newcommand{\unvec}{\operatorname{vec^{-1}}}
\newcommand{\reshape}[1]{\operatorname{reshape}_{#1}}
\newcommand{\vech}{\operatorname{vech}}
\newcommand{\rank}{\operatorname{rank}}
\newcommand{\diag}{\operatorname{diag}}
\DeclareMathOperator{\tr}{tr}
\DeclareMathOperator{\var}{Var}
\DeclareMathOperator{\cov}{Cov}
\DeclareMathOperator{\Span}{span}
\DeclareMathOperator{\E}{\operatorname{\mathbb{E}}}
% \DeclareMathOperator{\independent}{{\bot\!\!\!\bot}}
\DeclareMathOperator*{\argmin}{{arg\,min}}
\DeclareMathOperator*{\argmax}{{arg\,max}}
\newcommand{\D}{\textnormal{D}} % derivative
\renewcommand{\d}{\textnormal{d}} % differential
\renewcommand{\t}[1]{{#1^{\prime}}} % matrix transpose
\newcommand{\pinv}[1]{{#1^{\dagger}}} % `Moore-Penrose pseudoinverse`
\newcommand{\invlink}{\widetilde{\mat{g}}}
\newcommand{\todo}[1]{{\color{red}TODO: #1}}
\newcommand{\effie}[1]{{\color{blue}Effie: #1}}
% Pseudo Code Commands
\newcommand{\algorithmicbreak}{\textbf{break}}
\newcommand{\Break}{\State \algorithmicbreak}
\begin{document}
\maketitle
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{abstract}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We propose a method for sufficient dimension reduction of tensor-valued predictors (multi-dimensional arrays) for regression or classification. We assume a quadratic exponential family for a generalized linear model in an inverse regression setting where the relation via a link is of multi-linear nature.
Using a multi-linear relation allows us to perform per-axis reductions, which drastically reduces the total number of parameters for higher-order tensor-valued predictors. Under the exponential family we derive maximum likelihood estimates for the multi-linear sufficient dimension reduction of the tensor-valued predictors. Furthermore, we provide an estimation algorithm which exploits the tensor structure, allowing efficient implementations. The performance of the method is illustrated via simulations and real world examples are provided.
\end{abstract}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Introduction}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Notation}
Vectors are written as boldface lowercase letters (e.g. $\mat a$, $\mat b$), matrices use boldface uppercase or Greek letters (e.g. $\mat A$, $\mat B$, $\mat\alpha$, $\mat\Delta$). The identity matrix of dimensions $p\times p$ is denoted by $\mat{I}_p$ and the commutation matrix as $\mat{K}_{p, q}$, or $\mat{K}_p$ in case of $p = q$. Tensors, meaning multi-dimensional arrays of order at least 3, use uppercase calligraphic letters (e.g. $\ten{A}$, $\ten{B}$, $\ten{X}$, $\ten{Y}$, $\ten{F}$). Boldface indices (e.g. $\mat{i}, \mat{j}, \mat{k}$) denote multi-indices $\mat{i} = (i_1, ..., i_r)\in[\mat{d}]$, where the bracket notation is a shorthand for $[r] = \{1, ..., r\}$ and, in conjunction with a multi-index as argument, means $[\mat{d}] = [d_1]\times ... \times[d_r]$.
Let $\ten{A} = (a_{i_1,...,i_r})\in\mathbb{R}^{d_1\times ...\times d_r}$ be an order\footnote{Also called rank, hence the variable name $r$, but this term is \emph{not} used as it leads to confusion with the rank as in ``the rank of a matrix''.} $r$ tensor, where $r\in\mathbb{N}$ is the number of modes or axes of $\ten{A}$. For matrices $\mat{B}_k\in\mathbb{R}^{p_k\times d_k}$ with $k\in[r] = \{1, 2, ..., r\}$ the \emph{multi-linear multiplication} is defined element wise as
\begin{displaymath}
(\ten{A}\times\{\mat{B}_1, ..., \mat{B}_r\})_{j_1, ..., j_r} = \sum_{i_1, ..., i_r = 1}^{d_1, ..., d_r} a_{i_1, ..., i_r}(B_{1})_{j_1, i_1} \cdots (B_{r})_{j_r, i_r}
\end{displaymath}
which results in an order $r$ tensor of dimensions $p_1\times ...\times p_r$. With this, the \emph{$k$-mode product} between the tensor $\ten{A}$ and the matrix $\mat{B}_k$ is given by
\begin{displaymath}
\ten{A}\times_k\mat{B}_k = \ten{A}\times\{\mat{I}_{d_1}, ..., \mat{I}_{d_{k-1}}, \mat{B}_{k}, \mat{I}_{d_{k+1}}, ..., \mat{I}_{d_r}\}.
\end{displaymath}
Furthermore, the notation $\ten{A}\times_{k\in S}$ is a shorthand for the iterated application of the mode product for all indices in $S\subseteq[r]$. For example $\ten{A}\times_{k\in\{2, 5\}}\mat{B}_k = \ten{A}\times_2\mat{B}_2\times_5\mat{B}_5$. By only allowing $S$ to be a set this notation is unambiguous because mode products along different modes commute, that is $j\neq k\Rightarrow\ten{A}\times_j\mat{B}_j\times_k\mat{B}_k = \ten{A}\times_k\mat{B}_k\times_j\mat{B}_j$.
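For computations, the multi-linear multiplication and the $k$-mode product can be realized directly via tensor contractions. The following is a minimal, purely illustrative \texttt{numpy} sketch; the helper names \texttt{mode\_prod} and \texttt{mlm} are ours (not part of any library) and modes are $0$-based in code.
\begin{verbatim}
import numpy as np

def mode_prod(A, B, k):
    """k-mode product A x_k B for a matrix B of shape (p_k, d_k)."""
    # contract the columns of B against mode k of A, then move the
    # resulting mode back into position k
    return np.moveaxis(np.tensordot(B, A, axes=(1, k)), 0, k)

def mlm(A, Bs, modes=None):
    """Multi-linear multiplication A x {B_1, ..., B_r} (or over `modes`)."""
    modes = range(len(Bs)) if modes is None else modes
    for k, B in zip(modes, Bs):
        A = mode_prod(A, B, k)
    return A

# mode products along different modes commute
A = np.random.randn(3, 4, 5)
B2, B3 = np.random.randn(6, 4), np.random.randn(7, 5)
assert np.allclose(mode_prod(mode_prod(A, B2, 1), B3, 2),
                   mode_prod(mode_prod(A, B3, 2), B2, 1))
\end{verbatim}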
The \emph{inner product} between two tensors of the same order and dimensions is
\begin{displaymath}
\langle\ten{A}, \ten{B}\rangle = \sum_{i_1, ..., i_r} a_{i_1, ..., i_r}b_{i_1, ..., i_r}
\end{displaymath}
with which the \emph{Frobenius norm} is $\|\ten{A}\|_F = \sqrt{\langle\ten{A}, \ten{A}\rangle}$. Also of interest is the \emph{maximum norm} $\|\ten{A}\|_{\infty} = \max_{i_1, ..., i_r} |a_{i_1, ..., i_r}|$. The Frobenius and maximum norms are also used for matrices, while for a vector $\mat{a}$ the \emph{2 norm} is $\|\mat{a}\|_2 = \sqrt{\langle\mat{a}, \mat{a}\rangle}$.
Matrices and tensors can be \emph{vectorized} by the \emph{vectorization} operator $\vec$. For tensors of order at least $2$ the \emph{flattening} (or \emph{unfolding} or \emph{matricization}) is a reshaping of the tensor into a matrix along a particular mode. For a tensor $\ten{A}$ of order $r$ and dimensions $d_1, ..., d_r$ the $k$-mode unfolding $\ten{A}_{(k)}$ is a $d_k\times \prod_{l=1, l\neq k}d_l$ matrix. For the tensor $\ten{A} = (a_{i_1,...,i_r})\in\mathbb{R}^{d_1\times ...\times d_r}$ the elements of the $k$-mode unfolding $\ten{A}_{(k)}$ are
\begin{displaymath}
(\ten{A}_{(k)})_{i_k, j} = a_{i_1, ..., i_r}\quad\text{ with }\quad j = 1 + \sum_{\substack{l = 1\\l \neq k}}^r (i_l - 1) \prod_{\substack{m = 1\\m\neq k}}^{l - 1}d_m.
\end{displaymath}
The rank of a tensor $\ten{A}$ of dimensions $d_1\times ...\times d_r$ is given by the vector $\rank{\ten{A}} = (a_1, ..., a_r)\in[d_1]\times...\times[d_r]$, where $a_k = \rank(\ten{A}_{(k)})$ is the usual matrix rank of the $k$-mode unfolding.
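A matching \texttt{numpy} sketch of the $k$-mode unfolding (column-major, consistent with the index formula above; the helpers \texttt{unfold} and \texttt{fold} are ours) together with the multi-linear rank:
\begin{verbatim}
import numpy as np

def unfold(A, k):
    """k-mode unfolding A_(k); remaining modes in increasing order,
    first remaining mode varying fastest (column-major)."""
    return np.reshape(np.moveaxis(A, k, 0), (A.shape[k], -1), order="F")

def fold(M, k, shape):
    """Inverse of unfold for a tensor of the given shape."""
    rest = [d for i, d in enumerate(shape) if i != k]
    return np.moveaxis(np.reshape(M, [shape[k]] + rest, order="F"), 0, k)

A = np.random.randn(2, 3, 4)
for k in range(A.ndim):
    assert np.allclose(fold(unfold(A, k), k, A.shape), A)
# multi-linear rank = vector of ranks of the unfoldings
print([np.linalg.matrix_rank(unfold(A, k)) for k in range(A.ndim)])
\end{verbatim}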
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Quadratic Exponential Family GLM}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{description}
\item[Distribution]
\begin{displaymath}
f_{\mat{\theta}_y}(\ten{X}\mid Y = y) = h(\ten{X})\exp(\t{\mat{\eta}(\mat{\theta}_y)}\mat{t}(\ten{X}) - b(\mat{\theta}_y))
\end{displaymath}
\item[(inverse) link]
\begin{displaymath}
\invlink(\mat{\eta}(\mat{\theta}_y)) = \E_{\mat{\theta}_y}[\mat{t}(\ten{X})\mid Y = y]
\end{displaymath}
\item[(multi) linear predictor] For
\begin{displaymath}
\mat{\eta}_y = \mat{\eta}(\mat{\theta}_y) = \begin{pmatrix}
\mat{\eta}_1(\mat{\theta}_y) \\
\mat{\eta}_2(\mat{\theta}_y)
\end{pmatrix},\qquad
\mat{t}(\ten{X}) = \begin{pmatrix}
\mat{t}_1(\ten{X}) \\
\mat{t}_2(\ten{X})
\end{pmatrix} = \begin{pmatrix}
\vec{\ten{X}} \\
\vec{\ten{X}}\otimes\vec{\ten{X}}
\end{pmatrix}
\end{displaymath}
where
\begin{align*}
\mat{\eta}_1(\mat{\theta}_y) &= \mat{\eta}_{y,1} = c_1 \vec(\overline{\ten{\eta}}_1 + \ten{F}_y\times_{k\in[r]}\mat{\alpha}_k) \\
\mat{\eta}_2(\mat{\theta}_y) &= \mat{\eta}_{y,2} = c_2 \vec{\bigotimes_{k = r}^1 \mat{\Omega}_k}
\end{align*}
with model parameters $\overline{\ten{\eta}}_1, \mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{\Omega}_1, ..., \mat{\Omega}_r$, where $\overline{\ten{\eta}}_1$ is a $p_1\times ... \times p_r$ tensor, the $\mat{\alpha}_j$ are unconstrained $p_j\times q_j$ matrices and the $\mat{\Omega}_j$ are symmetric $p_j\times p_j$ matrices for each of the $j = 1, ..., r$ modes. Finally, $c_1$ and $c_2$ are known constants simplifying modeling for specific distributions (a small numerical sketch of this parameterization is given below).
\end{description}
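To make the parameterization concrete, the following purely illustrative \texttt{numpy} sketch assembles $\mat{\eta}_{y,1}$ and $\mat{\eta}_{y,2}$ for hypothetical dimensions with $r = 2$, where the multi-linear product reduces to $\mat{\alpha}_1\ten{F}_y\t{\mat{\alpha}_2}$; the values of $c_1, c_2$ are placeholders.
\begin{verbatim}
import numpy as np
from functools import reduce

p, q = (3, 4), (2, 2)                    # hypothetical dimensions, r = 2
c1, c2 = 1.0, -0.5                       # placeholder constants
eta1_bar = np.random.randn(*p)           # intercept tensor
alphas = [np.random.randn(pk, qk) for pk, qk in zip(p, q)]
Omegas = [np.eye(pk) + 0.3 * np.ones((pk, pk)) for pk in p]  # symmetric
F_y = np.random.randn(*q)                # known tensor-valued function of y

# eta_{y,1} = c1 * vec(eta1_bar + F_y x_1 alpha_1 x_2 alpha_2)
eta_y1 = c1 * (eta1_bar + alphas[0] @ F_y @ alphas[1].T).ravel(order="F")
# eta_{y,2} = c2 * vec(Omega_2 kron Omega_1)
eta_y2 = c2 * reduce(np.kron, reversed(Omegas)).ravel(order="F")
\end{verbatim}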
% With that approach we get
% \begin{displaymath}
% \t{\mat{\eta}(\mat{\theta}_y)}\mat{t}(\ten{X}) = \t{\mat{\eta}_{y,1}}\mat{t}_1(\ten{X}) + \t{\mat{\eta}_{y,2}}\mat{t}_2(\ten{X}) = \langle\overline{\ten{\eta}}_1 + \ten{F}_y\times_{k\in[r]}\mat{\alpha}_k, \ten{X} \rangle + \langle\ten{X}\times_{k\in[r]}\mat{\Omega}_k, \ten{X} \rangle.
% \end{displaymath}
\begin{theorem}[Log-Likelihood and Score]
For $n$ i.i.d. observations $(\ten{X}_i, y_i), i = 1, ..., n$ the log-likelihood has the form
\begin{displaymath}
l(\mat{\eta}_y) = \sum_{i = 1}^n(\log h(\ten{X}_i) + c_1\langle\overline{\ten{\eta}}_1 + \ten{F}_{y_i}\times_{k\in[r]}\mat{\alpha}_k, \ten{X}_i \rangle + c_2\langle\ten{X}_i\times_{k\in[r]}\mat{\Omega}_k, \ten{X}_i \rangle - b(\mat{\eta}_{y_i})).
\end{displaymath}
% The MLE estimate for the intercept term $\overline{\ten{\eta}}_1$ is
% \begin{displaymath}
% \widehat{\ten{\eta}}_1 = \frac{1}{n}\sum_{i = 1}^n \ten{X}_i
% \end{displaymath}
The gradients with respect to the GLM parameters $\overline{\ten{\eta}}_1$, $\mat{\alpha}_j$ and $\mat{\Omega}_j$ for $j = 1, ..., r$ are given by
\begin{align*}
\nabla_{\overline{\ten{\eta}}_1}l &= c_1\sum_{i = 1}^n \reshape{\mat{p}}(\mat{t}_1(\ten{X}_i) - \invlink_1(\mat{\eta}_{y_i})), \\
\nabla_{\mat{\alpha}_j}l &= c_1 \sum_{i = 1}^n \reshape{\mat{p}}(\mat{t}_1(\ten{X}_i) - \invlink_1(\mat{\eta}_{y_i}))_{(j)}\t{(\ten{F}_{y_i}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}, \\
\vec\nabla_{\mat{\Omega}_j}l &= c_2 \mat{D}_{p_j}\t{\mat{D}_{p_j}} \reshape{(\mat{p}, \mat{p})}\!\!\Big(\sum_{i = 1}^n(\mat{t}_2(\ten{X}_i) - \invlink_2(\mat{\eta}_{y_i}))\Big)_{(j, r + j)}\vec\bigotimes_{\substack{k = r\\k\neq j}}^{1}\mat{\Omega}_k
\end{align*}
% The Fisher Information for the GLM parameters is given block wise by
% \begin{displaymath}
% % \mathcal{I}_{\ten{X}\mid Y = y}(\vec{\overline{\ten{\eta}}_1}, \vec\mat{\alpha}_1, ..., \vec\mat{\alpha}_r, \vec\mat{\Omega}_1, ..., \vec\mat{\Omega}_r) = \begin{pmatrix}
% \mathcal{I}_{\ten{X}\mid Y = y} = \begin{pmatrix}
% \mathcal{I}(\overline{\ten{\eta}}_1) & \mathcal{I}(\overline{\ten{\eta}}_1, \mat{\alpha}_1) & \cdots & \mathcal{I}(\overline{\ten{\eta}}_1, \mat{\alpha}_r) & \mathcal{I}(\overline{\ten{\eta}}_1, \mat{\Omega}_1) & \cdots & \mathcal{I}(\overline{\ten{\eta}}_1, \mat{\Omega}_r) \\
% \mathcal{I}(\mat{\alpha}_1, \overline{\ten{\eta}}_1) & \mathcal{I}(\mat{\alpha}_1) & \cdots & \mathcal{I}(\mat{\alpha}_1, \mat{\alpha}_r) & \mathcal{I}(\mat{\alpha}_1, \mat{\Omega}_1) & \cdots & \mathcal{I}(\mat{\alpha}_1, \mat{\Omega}_r) \\
% \vdots & \vdots & \ddots & \vdots & \vdots & \ddots & \vdots \\
% \mathcal{I}(\mat{\alpha}_r, \overline{\ten{\eta}}_1) & \mathcal{I}(\mat{\alpha}_r, \mat{\alpha}_1) & \cdots & \mathcal{I}(\mat{\alpha}_r) & \mathcal{I}(\mat{\alpha}_r, \mat{\Omega}_1) & \cdots & \mathcal{I}(\mat{\alpha}_r, \mat{\Omega}_r) \\
% \mathcal{I}(\mat{\Omega}_1, \overline{\ten{\eta}}_1) & \mathcal{I}(\mat{\Omega}_1, \mat{\alpha}_1) & \cdots & \mathcal{I}(\mat{\Omega}_1, \mat{\alpha}_r) & \mathcal{I}(\mat{\Omega}_1) & \cdots & \mathcal{I}(\mat{\Omega}_1, \mat{\Omega}_r) \\
% \vdots & \vdots & \ddots & \vdots & \vdots & \ddots & \vdots \\
% \mathcal{I}(\mat{\Omega}_r, \overline{\ten{\eta}}_1) & \mathcal{I}(\mat{\Omega}_r, \mat{\alpha}_1) & \cdots & \mathcal{I}(\mat{\Omega}_r) & \mathcal{I}(\mat{\Omega}_r, \mat{\Omega}_1) & \cdots & \mathcal{I}(\mat{\Omega}_r)
% \end{pmatrix}
% \end{displaymath}
% where
% \begin{align*}
% \mathcal{I}(\overline{\ten{\eta}}_1) &= -\sum_{i = 1}^n \cov_{\mat{\theta}_{y_i}}(\vec\ten{X}\mid Y = y_i), \\
% \mathcal{I}(\mat{\alpha}_j) &= -\sum_{i = 1}^n ((\ten{F}_{y_i}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}\otimes\mat{I}_{p_j})\mat{K}_{\mat{p},(j)}\cov_{\mat{\theta}_{y_i}}(\vec\ten{X}\mid Y = y_i)\t{\mat{K}_{\mat{p},(j)}}(\t{(\ten{F}_{y_i}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}\otimes\mat{I}_{p_j}), \\
% \mathcal{I}(\mat{\alpha}_j) &= -\sum_{i = 1}^n \todo{continue}
% \end{align*}
% \todo{Fisher Information}
\end{theorem}
The dimensions of the factors in the $\mat{\Omega}_j$ gradient are illustrated below.
\begin{displaymath}
\underbrace{ \mat{D}_{p_j}\t{\mat{D}_{p_j}} }_{\makebox[0pt]{\scriptsize $p_j^2\times p_j^2$}}
%
\underbrace{%
\overbrace{\reshape{(\mat{p}, \mat{p})}\!\!\Big(\sum_{i = 1}^n
\underbrace{ (\mat{t}_2(\ten{X}_i) - \invlink_2(\mat{\eta}_{y_i})) }_{p^2\times 1}
\Big)}^{\substack{\text{(tensor of order $2 r$)}\\p_1\times p_2\times ... \times p_r\times p_1\times p_2\times ... \times p_r}} \!\!\makebox[0pt]{\phantom{\Big)}}_{(j, r + j)}
}_{\substack{p_j^2\times (p / p_j)^2\\\text{(matricized / put $j$ mode axis to the front)}}}
%
\underbrace{%
\vec \overbrace{ \bigotimes_{\substack{k = r\\k\neq j}}^{1}\mat{\Omega}_k }^{\makebox[0pt]{\scriptsize $(p/p_j)\times (p/p_j)$}}
}_{\makebox[0pt]{\scriptsize $(p/p_j)^2\times 1$}}
\end{displaymath}
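The dimension bookkeeping of this gradient (omitting the duplication matrix factor $\mat{D}_{p_j}\t{\mat{D}_{p_j}}$) can be traced in the following \texttt{numpy} sketch; it assumes column-major vectorization and $0$-based modes, and uses random stand-ins for $\sum_i(\mat{t}_2(\ten{X}_i) - \invlink_2(\mat{\eta}_{y_i}))$ and the $\mat{\Omega}_k$.
\begin{verbatim}
import numpy as np
from functools import reduce

p = (2, 3, 4); r = len(p); P = int(np.prod(p)); j = 1   # 0-based mode

# stand-in for sum_i (t_2(X_i) - g_2(eta_{y_i})), a vector of length P^2
res2 = np.random.randn(P * P)
# reshape_(p,p): order 2r tensor of dimensions p_1 x...x p_r x p_1 x...x p_r
T = res2.reshape(p + p, order="F")
# (j, r+j)-matricization: modes j and r+j to the front, flatten the rest
M = np.moveaxis(T, (j, r + j), (0, 1)).reshape(p[j] ** 2, -1, order="F")

Omegas = [np.eye(pk) + 0.1 for pk in p]                  # symmetric stand-ins
K = reduce(np.kron, [Omegas[k] for k in reversed(range(r)) if k != j])
grad = M @ K.ravel(order="F")            # p_j^2 vector (before D_{p_j} D_{p_j}')
assert grad.shape == (p[j] ** 2,)
\end{verbatim}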
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Sufficient Dimension Reduction}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
By Theorem~1 in \cite{sdr-BuraDuarteForzani2016} we have that
\begin{displaymath}
\mat{R}(\ten{X}) = \t{\mat{B}}(\mat{t}(\ten{X}) - \E\mat{t}(\ten{X}))
\end{displaymath}
is a sufficient reduction, where $\mat{B}\in\mathbb{R}^{p(p + 1)\times q}$ with $\Span(\mat{B}) = \Span(\{\mat{\eta}_Y - \E_{Y}\mat{\eta}_Y : Y\in\mathcal{S}_Y\})$. Because $\E_Y\ten{F}_Y = 0$ we have $\E_Y\mat{\eta}_{Y,1} \equiv c_1\E[\vec(\overline{\ten{\eta}}_1 + \ten{F}_Y\times_{k\in[r]}\mat{\alpha}_k)] = c_1 \vec\overline{\ten{\eta}}_1$, and since $\mat{\eta}_{Y,2}$ does not depend on $Y$ (despite the notation), we get
\begin{displaymath}
\mat{\eta}_Y - \E_{Y}\mat{\eta}_Y = \begin{pmatrix}
\mat{\eta}_{Y,1} - \E_{Y}\mat{\eta}_{Y,1} \\
\mat{\eta}_{Y,2} - \E_{Y}\mat{\eta}_{Y,2}
\end{pmatrix} = \begin{pmatrix}
c_1\vec(\ten{F}_Y\times_{k\in[r]}\mat{\alpha}_k) \\
\mat{0}
\end{pmatrix}.
\end{displaymath}
Noting that
\begin{displaymath}
c_1\vec(\ten{F}_Y\times_{k\in[r]}\mat{\alpha}_k)
= c_1\Big( \bigotimes_{k = r}^{1}\mat{\alpha}_k \Big)\vec(\ten{F}_Y)
\end{displaymath}
we get
\begin{displaymath}
\mat{B} = \begin{pmatrix}
c_1 \bigotimes_{k = r}^{1}\mat{\alpha}_k \\
\mat{0}_{p^2\times q}
\end{pmatrix}.
\end{displaymath}
Simplifying leads to
\begin{displaymath}
\t{\mat{B}}(\mat{t}(\ten{X}) - \E\mat{t}(\ten{X})) = c_1 \t{\Big( \bigotimes_{k = r}^{1}\mat{\alpha}_k \Big)}(\mat{t}_1(\ten{X}) - \E\mat{t}_1(\ten{X})).
\end{displaymath}
Now note $\Span(\mat{A}) = \Span(c \mat{A})$ for any matrix $\mat{A}$ and non-zero scalar $c$ as well as the definition $\mat{t}_1(\ten{X}) = \vec{\ten{X}}$ which proves the following.
\begin{theorem}[SDR]\label{thm:sdr}
A sufficient reduction for the regression $Y\mid \ten{X}$ under the quadratic exponential family inverse regression model \todo{reg} is given by
\begin{align*}
R(\ten{X})
&= \t{\mat{\beta}}(\vec{\ten{X}} - \E\vec{\ten{X}}) \\
&\equiv (\ten{X} - \E\ten{X})\times_{k\in[r]}\t{\mat{\alpha}_k}
\end{align*}
for the $p\times q$ dimensional matrix $\mat{\beta}$, with $p = \prod_{k = 1}^r p_k$ and $q = \prod_{k = 1}^r q_k$, given by
\begin{displaymath}
\mat{\beta} = \bigotimes_{k = r}^{1}\mat{\alpha}_k
\end{displaymath}
which satisfies $\Span(\mat{\beta}) = \Span(\{\mat{\eta}_{Y,1} - \E_{Y}\mat{\eta}_{Y,1} : Y\in\mathcal{S}_Y\})$.
\end{theorem}
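The equivalence of the vectorized and the multi-linear form of the reduction in Theorem~\ref{thm:sdr} can be checked numerically; a minimal \texttt{numpy} sketch for $r = 2$ (column-major vectorization, arbitrary example dimensions):
\begin{verbatim}
import numpy as np

p, q = (3, 4), (2, 2)
alpha1, alpha2 = np.random.randn(p[0], q[0]), np.random.randn(p[1], q[1])
beta = np.kron(alpha2, alpha1)           # alpha_2 kron alpha_1
Xc = np.random.randn(*p)                 # a centered observation X - E X

lhs = beta.T @ Xc.ravel(order="F")                   # beta' vec(X - E X)
rhs = (alpha1.T @ Xc @ alpha2).ravel(order="F")      # (X - E X) x_k alpha_k'
assert np.allclose(lhs, rhs)
\end{verbatim}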
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Special Distributions}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We illustrate the SDR method on two special cases, first the tensor normal distribution and second the multivariate Bernoulli (Ising model) distribution, with vector-, matrix- and tensor-valued predictors.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Tensor Normal}
Let $\ten{X}, \ten{F}_y$ be order $r$ tensors of dimensions $p_1\times ... \times p_r$ and $q_1\times ... \times q_r$, respectively. We assume the inverse regression model for $\ten{X}\mid Y = y$ to be tensor normal distributed with density
\begin{displaymath}
f_{\mat{\theta}_y}(\ten{X}\mid Y = y) = (2\pi)^{-p/2}\prod_{k = 1}^r |\mat{\Delta}_{k}|^{-p / (2 p_{k})}\exp\Big(
-\frac{1}{2}\langle \ten{X} - \ten{\mu}_y, (\ten{X} - \ten{\mu}_y)\times_{k\in[r]}\mat{\Delta}_{k}^{-1} \rangle
\Big)
\end{displaymath}
with location parameter tensor $\ten{\mu}_y$ depending on $y$ and the symmetric covariance matrices $\mat{\Delta}_{k}$ for each of the $k\in[r]$ modes (independent of $y$), collected in the parameter vector $\mat{\theta}_y = (\ten{\mu}_y, \mat{\Delta}_1, ..., \mat{\Delta}_r)$. Rewriting into the form of a quadratic exponential family leads to
\begin{align*}
f_{\mat{\theta}_y}(\ten{X}\mid Y = y)
&= (2\pi)^{-p/2} \exp\Big(
-\frac{1}{2}\langle \ten{X}, \ten{X}\times_{k\in[r]}\mat{\Delta}_{k}^{-1} \rangle
+\langle \ten{X}, \ten{\mu}_y\times_{k\in[r]}\mat{\Delta}_k^{-1} \rangle \\
&\makebox[10em]{}-\frac{1}{2}\langle \ten{\mu}_y, \ten{\mu}_y\times_{k\in[r]}\mat{\Delta}_{k}^{-1} \rangle
-\sum_{k = 1}^r \frac{p}{2 p_{k}}\log|\mat{\Delta}_k|
\Big) \\
&= h(\ten{X})\exp(\t{\mat{{\eta}}(\mat{\theta}_y)}\mat{t}(\ten{X}) - b(\mat{\theta}_y)).
\end{align*}
Identifying the exponential family components gives
\begin{align*}
h(\ten{X}) &= (2\pi)^{-p/2} \\
b(\mat{\theta}_y) &= \frac{1}{2}\langle \ten{\mu}_y, \ten{\mu}_y\times_{k\in[r]}\mat{\Delta}_{k}^{-1} \rangle + \sum_{k = 1}^r \frac{p}{2 p_{k}}\log|\mat{\Delta}_{k}|
\end{align*}
and
\begin{align*}
\mat{\eta}(\mat{\theta}_y) &= (\mat{\eta}_1(\mat{\theta}_y); \mat{\eta}_2(\mat{\theta}_y)) &
\mat{t}(\ten{X}) &= (\mat{t}_1(\ten{X}); \mat{t}_2(\ten{X}))
\end{align*}
where
\begin{align*}
\mat{\eta}_1(\mat{\theta}_y) = \mat{\eta}_{y,1} &= \vec(\ten{\mu}_y\times_{k\in[r]}\mat{\Delta}_{k}^{-1}), &
\mat{t}_1(\ten{X}) &= \vec\ten{X}, \\
\mat{\eta}_2(\mat{\theta}_y) = \mat{\eta}_{y,2} &= -\frac{1}{2}\vec\bigotimes_{k = r}^{1}\mat{\Delta}_{k}^{-1}, &
\mat{t}_2(\ten{X}) &= \vec\ten{X}\otimes\vec\ten{X}.
\end{align*}
The natural parameters are modeled as described in the multi-linear GLM by
\begin{align*}
\mat{\eta}_{y,1} &= \vec(\overline{\ten{\eta}}_1 + \ten{F}_y\times_{k\in[r]}\mat{\alpha}_{k}) \\
\mat{\eta}_{y,2} &= -\frac{1}{2}\vec\bigotimes_{k = r}^{1}\mat{\Omega}_{k}.
\end{align*}
The intercept parameter $\overline{\ten{\eta}}_1$ is of the same dimensions as $\ten{X}$ and the reduction matrices $\mat{\alpha}_j$ are of dimensions $p_j\times q_j$ while the symmetric $\mat{\Omega}_j$ are of dimensions $p_j\times p_j$. The inverse relation from the GLM parameters to the tensor normal parameters is
\begin{align*}
\ten{\mu}_y &= (\overline{\ten{\eta}}_1 + \ten{F}_y\times_{j\in[r]}\mat{\alpha}_{j})\times_{k\in[r]}\mat{\Omega}_{k}^{-1}, \qquad \vec\ten{\mu}_y = (\unvec(-2\mat{\eta}_{y,2}))^{-1}\mat{\eta}_{y,1}, \\
\mat{\Delta}_{k} &= \mat{\Omega}_{k}^{-1}
\end{align*}
for each $k\in[r]$. The inverse link is given by
\begin{displaymath}
\invlink(\mat{\eta}_y) = \E_{\mat{\theta}_y}[\mat{t}(\ten{X})\mid Y = y]
\end{displaymath}
consisting of the first and second (uncentered) vectorized moments of the tensor normal distribution.
\begin{align*}
\invlink_1(\mat{\eta}_y) &\equiv \E[\vec\ten{X} \mid Y = y] = \vec\ten{\mu}_y \\
&= \vec\big((\overline{\ten{\eta}}_1 + \ten{F}_y\times_{k\in[r]}\mat{\alpha}_k) \times_{k\in[r]}\mat{\Omega}_k^{-1}\big), \\
\invlink_2(\mat{\eta}_y) &\equiv \E[\vec(\ten{X})\t{\vec(\ten{X})} \mid Y = y] \\
&= \cov(\vec\ten{X} \mid Y = y) + \vec(\ten{\mu}_y)\t{\vec(\ten{\mu}_y)} \\
&= \bigotimes_{k = r}^{1}\mat{\Omega}_k^{-1} + \vec(\ten{\mu}_y)\t{\vec(\ten{\mu}_y)}
\end{align*}
For estimation purposes it is also of interest to express the log-partition function $b$ in terms of the natural parameters (or the GLM parameters), which takes the form
\begin{displaymath}
b(\mat{\eta}_y) = \frac{1}{2}\t{\mat{\eta}_{y, 1}}(\unvec(-2\mat{\eta}_{y, 2}))^{-1}\mat{\eta}_{y, 1} - \frac{1}{2}\log|\unvec(-2\mat{\eta}_{y, 2})|.
\end{displaymath}
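As a sanity check, the two forms of the log-partition function can be compared numerically; a minimal \texttt{numpy} sketch for $r = 2$ with randomly generated per-mode covariances (all helper names are ours):
\begin{verbatim}
import numpy as np
from functools import reduce

p = (2, 3); P = int(np.prod(p))
Deltas = []
for pk in p:
    S = np.random.randn(pk, pk)
    Deltas.append(S @ S.T + pk * np.eye(pk))          # per-mode covariances
mu = np.random.randn(*p)                              # mu_y for a fixed y

Sigma = reduce(np.kron, reversed(Deltas))             # Delta_2 kron Delta_1
Omega = np.linalg.inv(Sigma)                          # kron of the Omega_k
eta1 = Omega @ mu.ravel(order="F")                    # vec(mu_y x_k Delta_k^{-1})
eta2_mat = -0.5 * Omega                               # unvec(eta_{y,2})

b_eta = (0.5 * eta1 @ np.linalg.solve(-2 * eta2_mat, eta1)
         - 0.5 * np.linalg.slogdet(-2 * eta2_mat)[1])
b_theta = (0.5 * mu.ravel(order="F") @ Omega @ mu.ravel(order="F")
           + sum(P / (2 * pk) * np.linalg.slogdet(D)[1]
                 for pk, D in zip(p, Deltas)))
assert np.isclose(b_eta, b_theta)
\end{verbatim}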
Denote the residuals as
\begin{displaymath}
\ten{R}_i = \ten{X}_i - \ten{\mu}_{y_i}
\end{displaymath}
then with $\overline{\ten{R}} = \frac{1}{n}\sum_{i = 1}^n \ten{R}_i$ we get
\begin{align*}
\nabla_{\overline{\ten{\eta}}_1} l &= \overline{\ten{R}}, \\
\nabla_{\mat{\alpha}_j} l &= \frac{1}{n}\ten{R}_{(j)}\t{(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}, \\
\D l(\mat{\Omega}_j) &= \frac{1}{2}\t{\vec\Big(\frac{p}{p_j}\mat{\Omega}_j^{-1} - (\ten{X} + \ten{\mu}_y)_{(j)}\t{(\ten{R}\times_{k\in[r]\backslash j}\mat{\Omega}_k)_{(j)}}\Big)}\mat{D}_{p_j}\t{\mat{D}_{p_j}}
\end{align*}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Initial Values}
First we set the gradient with respect to $\overline{\ten{\eta}}_1$ to zero
\begin{gather*}
0 \overset{!}{=} \nabla_{\overline{\ten{\eta}}_1}l = c_1\sum_{i = 1}^n (\ten{X}_i - \ten{\mu}_{y_i}) \\
\overline{\ten{X}} = (\overline{\ten{\eta}}_1 + \overline{\ten{F}_y}\times_{k\in[r]}\mat{\alpha}_{k})\times_{l\in[r]}\mat{\Omega}_{l}^{-1} \\
\overline{\ten{X}}\times_{l\in[r]}\mat{\Omega}_{l} = \overline{\ten{\eta}}_1 + \overline{\ten{F}_y}\times_{k\in[r]}\mat{\alpha}_{k} \approx \overline{\ten{\eta}}_1 \\
\overline{\ten{\eta}}_1^{(0)} = \overline{\ten{X}}\times_{k\in[r]}\mat{\Omega}_{k}^{(0)}
\end{gather*}
where the approximation is due to the assumption that $\E \ten{F}_y = 0$. For the initial values of the scatter matrices $\mat{\Omega}_{k}$ we ignore the relation to the response and simply estimate them as the marginal scatter matrices. These initial estimates overemphasize the variability in the reduction subspace. Therefore, we first compute the unscaled mode covariance estimates
\begin{displaymath}
\widetilde{\mat{\Delta}}_j^{(0)} = \frac{p_j}{n p} (\ten{X} - \overline{\ten{X}})_{(j)}\t{(\ten{X} - \overline{\ten{X}})_{(j)}}.
\end{displaymath}
The next step is to scale them such that their Kronecker product has an appropriate trace
\begin{displaymath}
\mat{\Delta}_j^{(0)} = \left(\frac{\|\ten{X} - \overline{\ten{X}}\|_F^2}{n \prod_{k = 1}^r \tr(\widetilde{\mat{\Delta}}_j^{(0)})}\right)^{1 / r} \widetilde{\mat{\Delta}}_j^{(0)}.
\end{displaymath}
Finally, the covariances need to be inverted to give initial estimates of the scatter matrices
\begin{displaymath}
\mat{\Omega}_j^{(0)} = (\mat{\Delta}_j^{(0)})^{-1}.
\end{displaymath}
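A minimal \texttt{numpy} sketch of these initial covariance and scatter estimates (the helper \texttt{unfold} is the $k$-mode unfolding from the notation section; dimensions and data are arbitrary stand-ins):
\begin{verbatim}
import numpy as np

def unfold(A, k):      # k-mode unfolding (column-major)
    return np.reshape(np.moveaxis(A, k, 0), (A.shape[k], -1), order="F")

n, p = 100, (3, 4)                         # hypothetical sample size, r = 2
X = np.random.randn(n, *p)                 # stand-in observations X_1, ..., X_n
Xc = X - X.mean(axis=0)                    # X_i - bar X

# unscaled mode covariances tilde Delta_j^(0)
D_tilde = [pj / (n * np.prod(p)) *
           sum(unfold(Xi, j) @ unfold(Xi, j).T for Xi in Xc)
           for j, pj in enumerate(p)]
# rescale so that the Kronecker product has an appropriate trace
scale = (np.sum(Xc ** 2) /
         (n * np.prod([np.trace(D) for D in D_tilde]))) ** (1 / len(p))
Delta0 = [scale * D for D in D_tilde]
Omega0 = [np.linalg.inv(D) for D in Delta0]   # initial scatter estimates
\end{verbatim}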
The really interesting part is obtaining initial estimates of the $\mat{\alpha}_j$ matrices. Setting the $\mat{\alpha}_j$ gradient to zero and substituting the initial estimates for $\overline{\ten{\eta}}_1$ and the $\mat{\Omega}_k$'s gives
\begin{gather*}
0 \overset{!}{=} \nabla_{\mat{\alpha}_j}l = c_1 \sum_{i = 1}^n \reshape{\mat{p}}(\mat{t}_1(\ten{X}_i) - \mat{g}_1(\mat{\eta}_{y_i}))_{(j)}\t{(\ten{F}_{y_i}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}} \\
(\ten{X} - \overline{\ten{X}})_{(j)}\t{(\ten{F}_y \times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}
= \mat{\Omega}_j^{(0)}\mat{\alpha}_j(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\Omega}_k^{(0)}\mat{\alpha}_k)_{(j)}\t{(\ten{F}_y \times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}
\end{gather*}
Now let $\mat{\Sigma}_k$ be the mode covariances of $\ten{F}_y$ and define $\ten{W}_y = \ten{F}_y\times_{k\in[r]}\mat{\Sigma}_k^{-1/2}$, so that $\ten{F}_y = \ten{W}_y\times_{k\in[r]}\mat{\Sigma}_k^{1/2}$. Then we get
\begin{gather*}
(\ten{X} - \overline{\ten{X}})_{(j)}\t{(\ten{F}_y \times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}
= \mat{\Omega}_j^{(0)}\mat{\alpha}_j\mat{\Sigma}_j^{1/2}(\ten{W}_y\times_{k\in[r]\backslash j}\mat{\Omega}_k^{(0)}\mat{\alpha}_k\mat{\Sigma}_k^{1/2})_{(j)}\t{(\ten{W}_y \times_{k\in[r]\backslash j}\mat{\alpha}_k \mat{\Sigma}_{k}^{1/2})_{(j)}}\mat{\Sigma}_{j}^{1/2} \\
= \mat{\Omega}_j^{(0)}\mat{\alpha}_j\mat{\Sigma}_j^{1/2}(\ten{W}_y)_{(j)}\Big(\mat{I}_n\otimes\bigotimes_{\substack{k = r\\k\neq j}}^{1}\mat{\Sigma}_k^{1/2}\t{\mat{\alpha}_k}\mat{\Omega}_k^{(0)}\mat{\alpha}_k\mat{\Sigma}_{k}^{1/2}\Big)\t{(\ten{W}_y)_{(j)}}\mat{\Sigma}_{j}^{1/2}.
\end{gather*}
Now we let the $\mat{\alpha}^{(0)}_k$ be such that $\mat{\Sigma}_k^{1/2}\t{(\mat{\alpha}^{(0)}_k)}\mat{\Omega}_k^{(0)}\mat{\alpha}^{(0)}_k\mat{\Sigma}_{k}^{1/2} = \mat{I}_{q_k}$ for each $k\in[r]$, which leads by substitution to
\begin{displaymath}
(\ten{X} - \overline{\ten{X}})_{(j)}\t{(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\alpha}^{(0)}_k)_{(j)}}
= \mat{\Omega}_j^{(0)}\mat{\alpha}^{(0)}_j\mat{\Sigma}_j^{1/2}(\ten{W}_y)_{(j)}\t{(\ten{W}_y)_{(j)}}\mat{\Sigma}_{j}^{1/2}
= \frac{p_j}{n p}\mat{\Omega}_j^{(0)}\mat{\alpha}^{(0)}_j\mat{\Sigma}_j
\end{displaymath}
\todo{Does this make sense?!?!?!}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Ising Model}
For the inverse regression $\ten{X}\mid Y = y$ the Ising model probability mass function with $p (p + 1) / 2$ parameters $\mat{\theta}_y$ is given by
\begin{align*}
P_{\mat{\theta}_y}(\ten{X}\mid Y = y)
&= p_0(\mat{\theta}_y)\exp(\t{\vech(\vec(\ten{X})\t{\vec(\ten{X})})}\mat{\theta}_y) \\
&= h(\ten{X})\exp(\t{\mat{{\eta}}(\mat{\theta}_y)}\mat{t}(\ten{X}) - b(\mat{\theta}_y))
\end{align*}
where $h(\ten{X}) = 1$ and $b(\mat{\theta}_y) = -\log p_0(\mat{\theta}(\mat{\eta}_y))$.
Following the multi-linear GLM we model the natural parameters as
\begin{align*}
\mat{\eta}_{y,1} &\equiv c_1 (\overline{\ten{\eta}}_1 + \ten{F}_y\times_{k\in[r]}\mat{\alpha}_k), &
\mat{\eta}_{y,2} &\equiv c_2 \bigotimes_{k = r}^{1}\mat{\Omega}_k.
\end{align*}
This yields the following relation to the conditional Ising model parameters
\begin{displaymath}
\mat{\theta}_y = \mat{\theta}(\mat{\eta}_y) = \vech(\diag(\mat{\eta}_{y,1}) + (2_{p\times p} - \mat{I}_p) \odot \reshape{(p, p)}(\mat{\eta}_{y,2}))
\end{displaymath}
where $2_{p\times p}$ denotes the $p\times p$ matrix with all entries equal to $2$ and $c_1, c_2$ are non-zero known (modeled) constants between $0$ and $1$ such that $c_1 + c_2 = 1$. The ``inverse'' link is then computed via the Ising model as the conditional expectation of all interactions
\begin{align*}
\invlink_2(\mat{\eta}_y) \equiv \E_{\mat{\theta}_y}[\vec(\ten{X})\t{\vec(\ten{X})}\mid Y = y]
\end{align*}
which also incorporates the first moment, since for binary entries $\invlink_1(\mat{\eta}_y) = \diag(\E_{\mat{\theta}_y}[\vec(\ten{X})\t{\vec(\ten{X})}\mid Y = y])$.
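For small $p$ the conditional moments defining the inverse link can be computed by brute-force enumeration of all $2^p$ binary configurations. The following \texttt{numpy} sketch is our own illustration; it assumes the parameter matrix carries the main effects on the diagonal and each pairwise interaction once, matching the $\vech$ parameterization up to this convention.
\begin{verbatim}
import numpy as np
from itertools import product

def ising_moments(Theta):
    """Brute-force E[vec X] and E[vec X vec X'] for a small Ising model."""
    p = Theta.shape[0]
    m1 = np.zeros(p); m2 = np.zeros((p, p)); Z = 0.0
    for x in product([0.0, 1.0], repeat=p):
        x = np.array(x)
        w = np.exp(x @ np.triu(Theta) @ x)   # each interaction counted once
        Z += w; m1 += w * x; m2 += w * np.outer(x, x)
    return m1 / Z, m2 / Z

Theta = np.random.randn(4, 4); Theta = (Theta + Theta.T) / 2
m1, m2 = ising_moments(Theta)
assert np.allclose(m1, np.diag(m2))          # E[X_j] = E[X_j^2] for binary X_j
\end{verbatim}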
% The ``inverse'' link is given by
% \begin{align*}
% \invlink_1(\mat{\eta}_y) &\equiv \E_{\mat{\theta}(\mat{\eta}_y)}[\ten{X} | Y = y] \\
% \invlink_2(\mat{\eta}_y) &\equiv \E_{\mat{\theta}(\mat{\eta}_y)}[\vec(\ten{X})\t{\vec(\ten{X})} | Y = y]
% \end{align*}
% and note that $\diag(\E_{\mat{\theta}(\mat{\eta}_y)}[\vec(\ten{X})\t{\vec(\ten{X})} | Y = y]) \equiv \E_{\mat{\theta}(\mat{\eta}_y)}[\ten{X} | Y = y]$.
%
% The gradients of the log-likelihood are now given by
% \begin{align*}
% \nabla_{\overline{\ten{\eta}}_1} l
% &= \frac{1}{n}\sum_{i = 1}^n \ten{R}_i \\
% \nabla_{\mat{\alpha}_j} l
% &= \frac{1}{n}\ten{R}_{(j)}\t{(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\alpha}_j)_{(j)}} \\
% \vec(\nabla_{\mat{\Omega}_j} l)
% &= \t{\vec( (\reshape{(\mat{p}, \mat{p})}(\overline{\mat{t}_2(\ten{X}_i)} - \E[\mat{t}_2(\ten{X})\mid Y = y_i]))_{(j, r + j)} \vec\bigotimes_{\substack{k = r\\k\neq j}}^{1}\mat{\Omega}_j)}\mat{D}_{p_j}\t{\mat{D}_{p_j}}
% \end{align*}
% using the notation $\overline{\mat{t}_2(\ten{X})} = \frac{1}{n}\sum_{i = 1}^n \mat{t}_2(\ten{X}_i) = \frac{1}{n}\sum_{i = 1}^n \vec(\ten{X}_i)\otimes \vec(\ten{X}_i)$.
\printbibliography[heading=bibintoc,title={References}]
\appendix
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Vectorization and Matricization}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{displaymath}
\vec(\ten{A}\times_{k\in[r]}\mat{B}_k) = \Big(\bigotimes_{k = r}^1 \mat{B}_k\Big)\vec\ten{A}
\end{displaymath}
\begin{displaymath}
(\ten{A}\times_{k\in[r]}\mat{B}_k)_{(j)} = \mat{B}_j\ten{A}_{(j)}\bigotimes_{\substack{k = r\\k\neq j}}^1\t{\mat{B}_k}
\end{displaymath}
of which a special case is $(\ten{A}\times_{j}\mat{B}_j)_{(j)} = \mat{B}_j\ten{A}_{(j)}$.
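Both identities are easy to verify numerically; a minimal \texttt{numpy} sketch for $r = 2$ (the \texttt{unfold} helper is the column-major $k$-mode unfolding from the notation section):
\begin{verbatim}
import numpy as np

def unfold(A, k):      # k-mode unfolding (column-major)
    return np.reshape(np.moveaxis(A, k, 0), (A.shape[k], -1), order="F")

A = np.random.randn(3, 4)
B1, B2 = np.random.randn(5, 3), np.random.randn(6, 4)
prod = B1 @ A @ B2.T                         # A x_1 B_1 x_2 B_2

# vec(A x_k B_k) = (B_2 kron B_1) vec A
assert np.allclose(prod.ravel(order="F"),
                   np.kron(B2, B1) @ A.ravel(order="F"))
# (A x_k B_k)_(j) = B_j A_(j) (kron of the remaining B_k)'
assert np.allclose(unfold(prod, 0), B1 @ unfold(A, 0) @ B2.T)
assert np.allclose(unfold(prod, 1), B2 @ unfold(A, 1) @ B1.T)
\end{verbatim}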
Let $\ten{A}$ be an $n\times p_1\times ... \times p_r\times q_1\times ... \times q_r$ tensor and let the $\mat{B}_k$ be $p_k\times q_k$ matrices, then
\begin{displaymath}
\ten{A}_{(1)} \vec{\bigotimes_{k = r}^{1}\mat{B}_k}
=
\Big(\ten{R}(\ten{A})\times_{\substack{k + 1\\k\in[r]}}\t{\vec(\mat{B}_k)}\Big)_{(1)}
\end{displaymath}
where $\ten{R}$ is a permutation of the axis and reshaping of the tensor $\ten{A}$. This axis permutation converts $n\times p_1\times ... \times p_r\times q_1\times ... \times q_r$ to $n\times p_1\times q_1 \times ... \times p_r\times q_r$ and the reshaping vectorizes the axis pairs $p_k\times q_k$ leading to a tensor $\ten{R}(\ten{A})$ of dimensions $n\times p_1 q_1\times ...\times p_r q_r$.
An alternative way to write this, for each of the $i\in[n]$ vector components, is
\begin{displaymath}
\Big(\ten{A}_{(1)}\vec{\bigotimes_{k = r}^{1}\mat{B}_k}\Big)_{i}
= \sum_{J\in[(\mat{p}, \mat{q})]}
\ten{A}_{i, J}\prod_{k = 1}^r (B_k)_{J_k, J_{k + r}}
\end{displaymath}
using the notation $J\in[(\mat{p}, \mat{q})] = [p_1]\times ... \times [p_r]\times [q_1]\times ... \times [q_r]$.
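The element-wise form can be checked directly against the matricized form; a \texttt{numpy} sketch for $r = 2$ using \texttt{einsum} for the sum over $J\in[(\mat{p}, \mat{q})]$ (again with the column-major \texttt{unfold} helper):
\begin{verbatim}
import numpy as np

def unfold(A, k):      # k-mode unfolding (column-major)
    return np.reshape(np.moveaxis(A, k, 0), (A.shape[k], -1), order="F")

n, p, q = 5, (2, 3), (4, 2)
A = np.random.randn(n, *p, *q)               # n x p_1 x p_2 x q_1 x q_2
B1, B2 = np.random.randn(p[0], q[0]), np.random.randn(p[1], q[1])

lhs = unfold(A, 0) @ np.kron(B2, B1).ravel(order="F")
rhs = np.einsum("iabcd,ac,bd->i", A, B1, B2)   # sum over J in [(p, q)]
assert np.allclose(lhs, rhs)
\end{verbatim}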
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Pattern Matrices}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The \emph{duplication matrix} $\mat{D}_p$ of dimensions $p^2\times p(p + 1) / 2$ is defined implicitly such that for any symmetric $p\times p$ matrix $\mat{A}$ holds
\begin{displaymath}
\mat{D}_p\vech\mat{A} = \vec{\mat{A}}.
\end{displaymath}
Let $\mat{A}$ be a $p\times q$ matrix, then we denote the \emph{commutation matrix} $\mat{K}_{p,q}$ as the $p q\times p q$ matrix satisfying
\begin{displaymath}
\mat{K}_{p,q}\vec\mat{A} = \vec{\t{\mat{A}}}.
\end{displaymath}
The identity giving the commutation matrix its name is, for an $a_1\times a_2$ matrix $\mat{A}$ and a $b_1\times b_2$ matrix $\mat{B}$,
\begin{displaymath}
\mat{A}\otimes\mat{B} = \mat{K}_{a_1,b_1}(\mat{B}\otimes\mat{A})\t{\mat{K}_{a_2,b_2}}.
\end{displaymath}
For a generalization of the commutation matrix let $\ten{A}$ be a $p_1\times ...\times p_r$ tensor of order $r$. Then the \emph{generalized commutation matrix} $\mat{K}_{(p_1, ..., p_r),(j)}$ is implicitly defined such that
\begin{displaymath}
\mat{K}_{(p_1, ..., p_r),(j)}\vec{\ten{A}} = \vec{\ten{A}_{(j)}}
\end{displaymath}
for every mode $j \in[r]$. This is a direct generalization of the commutation matrix with the special case $\mat{K}_{(p,q),(2)} = \mat{K}_{p,q}$ and the trivial case $\mat{K}_{(p_1, ..., p_r),(1)} = \mat{I}_{p}$ for $p = \prod_{j=1}^r p_j$. Furthermore, with a dimension vector $\mat{p} = (p_1, ..., p_r)$ it is convenient to write $\mat{K}_{(p_1, ..., p_r),(j)}$ as $\mat{K}_{\mat{p},(j)}$. Its relation to the classical commutation matrix is given by
\begin{displaymath}
\mat{K}_{\mat{p}, (j)} = \mat{I}_{\overline{p}_j} \otimes \mat{K}_{\underline{p}_j, p_j}
\end{displaymath}
where $\overline{p}_j = \prod_{k = j + 1}^r p_k$ and $\underline{p}_j = \prod_{k = 1}^{j - 1}p_k$ with an empty product set to $1$.
The generalized commutation matrix leads to a generalization of the Kronecker product commutation identity
\begin{displaymath}
\bigotimes_{\substack{k = r\\k\neq j}}^{1}\mat{A}_k\otimes \mat{A}_j = \mat{K}_{\mat{p}, (j)}\Big(\bigotimes_{k = r}^1 \mat{A}_k\Big)\t{\mat{K}_{\mat{q}, (j)}}
\end{displaymath}
for arbitrary matrices $\mat{A}_k$ of dimensions $p_k\times q_k$, $k \in[r]$ which are collected in the dimension vectors $\mat{p} = (p_1, ..., p_r)$ and $\mat{q} = (q_1, ..., q_r)$. Next the \emph{symmetrizer} $\mat{N}_p$ is a $p^2\times p^2$ matrix such that for any $p\times p$ matrix $\mat{A}$
\begin{displaymath}
\mat{N}_p \vec{\mat{A}} = \frac{1}{2}(\vec{\mat{A}} + \vec{\t{\mat{A}}}).
\end{displaymath}
Another matrix which might come in handy is the \emph{selection matrix} $\mat{S}_p$ of dimensions $p\times p^2$ which selects the diagonal elements of a $p\times p$ matrix $\mat{A}$ from its vectorization
\begin{displaymath}
\mat{S}_p\vec{\mat{A}} = \diag{\mat{A}}
\end{displaymath}
where $\diag{\mat{A}}$ denotes the vector of diagonal elements of $\mat{A}$.
For two matrices $\mat A$ of dimensions $a_1\times a_2$ and $\mat B$ of dimensions $b_1\times b_2$ holds
\begin{equation}\label{eq:vecKron}
\vec(\mat A\otimes\mat B) = (\mat{I}_{a_2}\otimes\mat{K}_{b_2,a_1}\otimes\mat{I}_{b_1})(\vec\mat A\otimes\vec\mat B).
\end{equation}
\begin{align*}
\pinv{\mat{D}_p} &= (\t{\mat{D}_p}\mat{D}_p)^{-1}\t{\mat{D}_p} \\
\pinv{\mat{D}_p}\mat{D}_p &= \mat{I}_{p(p+1)/2} \\
\mat{D}_p\pinv{\mat{D}_p} &= \mat{N}_{p} \\
\t{\mat{K}_{p,q}} &= \mat{K}_{p,q}^{-1} = \mat{K}_{q,p} \\
\t{\mat{K}_{\mat{p},(j)}} &= \mat{K}_{\mat{p},(j)}^{-1}
\end{align*}
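The pattern matrices are straightforward to construct explicitly, which also allows the listed properties to be verified numerically; a \texttt{numpy} sketch with our own constructor names \texttt{D\_mat} and \texttt{K\_mat}:
\begin{verbatim}
import numpy as np

def K_mat(p, q):
    """Commutation matrix: K_mat(p, q) @ vec(A) == vec(A.T) for p x q A."""
    K = np.zeros((p * q, p * q))
    for i in range(p):
        for j in range(q):
            K[j + q * i, i + p * j] = 1.0
    return K

def D_mat(p):
    """Duplication matrix: D_mat(p) @ vech(A) == vec(A) for symmetric A."""
    D = np.zeros((p * p, p * (p + 1) // 2))
    col = 0
    for j in range(p):
        for i in range(j, p):            # column-major lower triangle
            D[i + p * j, col] = D[j + p * i, col] = 1.0
            col += 1
    return D

p, q = 3, 4
Dp, Kpq, Np = D_mat(p), K_mat(p, q), 0.5 * (np.eye(p * p) + K_mat(p, p))
Dp_pinv = np.linalg.pinv(Dp)
assert np.allclose(Dp_pinv, np.linalg.solve(Dp.T @ Dp, Dp.T))
assert np.allclose(Dp_pinv @ Dp, np.eye(p * (p + 1) // 2))
assert np.allclose(Dp @ Dp_pinv, Np)
assert np.allclose(Kpq.T, np.linalg.inv(Kpq))
assert np.allclose(Kpq.T, K_mat(q, p))
\end{verbatim}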
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Matrix Calculus}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{example}
We want to find the derivative with respect to any of the $r$ symmetric $p_j\times p_j$ matrices $\mat{\Omega}_j$ where $j = 1, ..., r$ of the Kronecker product
\begin{displaymath}
\mat{F} = \bigotimes_{k = r}^1 \mat{\Omega}_k.
\end{displaymath}
Therefore, denote
\begin{align*}
p &= \prod_{k = 1}^r p_k, & \overline{p}_j &= \prod_{k = j + 1}^r p_k, & \underline{p}_j &= \prod_{k = 1}^{j - 1} p_k, \\
& & \overline{\mat{\Omega}}_j &= \bigotimes_{k = r}^{j+1}\mat{\Omega}_k, & \underline{\mat{\Omega}}_j &= \bigotimes_{k = j - 1}^{1}\mat{\Omega}_k
\end{align*}
which slightly simplifies the following. With this notation we have $p = \overline{p}_jp_j\underline{p}_j$ for any of the $j = 1, ..., r$. Furthermore, the matrices $\overline{\mat{\Omega}}_j$ and $\underline{\mat{\Omega}}_j$ are of dimensions $\overline{p}_j\times \overline{p}_j$ and $\underline{p}_j\times \underline{p}_j$, respectively. We start with the differential
\begin{align*}
\d\mat{F} &= \d\bigotimes_{k = r}^1 \mat{\Omega}_k
= \sum_{j = 1}^r \bigotimes_{k = r}^{j+1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k = j - 1}^{1}\mat{\Omega}_k
= \sum_{j = 1}^r \overline{\mat{\Omega}}_j\otimes\d\mat{\Omega}_j\otimes\underline{\mat{\Omega}}_j \\
&= \sum_{j = 1}^r \mat{K}_{\overline{p}_jp_j,\underline{p}_j}(\underline{\mat{\Omega}}_j\otimes\overline{\mat{\Omega}}_j\otimes\d\mat{\Omega}_j)\mat{K}_{\underline{p}_j,\overline{p}_jp_j}
\end{align*}
By vectorizing this transforms to
\begin{align*}
\d\vec\mat{F} &= \sum_{j = 1}^r (\mat{K}_{\overline{p}_jp_j,\underline{p}_j}\otimes\mat{K}_{\overline{p}_jp_j,\underline{p}_j})\vec(\underline{\mat{\Omega}}_j\otimes\overline{\mat{\Omega}}_j\otimes\d\mat{\Omega}_j) \\
&= \sum_{j = 1}^r (\mat{K}_{\overline{p}_jp_j,\underline{p}_j}\otimes\mat{K}_{\overline{p}_jp_j,\underline{p}_j})(\mat{I}_{\overline{p}_j\underline{p}_j}\otimes\mat{K}_{p_j,\overline{p}_j\underline{p}_j}\otimes\mat{I}_{p_j})(\vec(\underline{\mat{\Omega}}_j\otimes\overline{\mat{\Omega}}_j)\otimes\d\vec\mat{\Omega}_j) \\
&= \sum_{j = 1}^r (\mat{K}_{\overline{p}_jp_j,\underline{p}_j}\otimes\mat{K}_{\overline{p}_jp_j,\underline{p}_j})(\mat{I}_{\overline{p}_j\underline{p}_j}\otimes\mat{K}_{p_j,\overline{p}_j\underline{p}_j}\otimes\mat{I}_{p_j})(\vec(\underline{\mat{\Omega}}_j\otimes\overline{\mat{\Omega}}_j)\otimes\mat{I}_{p_j^2})\,\d\vec\mat{\Omega}_j \\
\end{align*}
leading to
\begin{displaymath}
\D\mat{F}(\mat{\Omega}_j) = (\mat{K}_{\overline{p}_jp_j,\underline{p}_j}\otimes\mat{K}_{\overline{p}_jp_j,\underline{p}_j})(\mat{I}_{\overline{p}_j\underline{p}_j}\otimes\mat{K}_{p_j,\overline{p}_j\underline{p}_j}\otimes\mat{I}_{p_j})(\vec(\underline{\mat{\Omega}}_j\otimes\overline{\mat{\Omega}}_j)\otimes\mat{I}_{p_j^2})
\end{displaymath}
for each $j = 1, ..., r$. Note that the $p^2\times p^2$ matrices
\begin{displaymath}
\mat{P}_j = (\mat{K}_{\overline{p}_jp_j,\underline{p}_j}\otimes\mat{K}_{\overline{p}_jp_j,\underline{p}_j})(\mat{I}_{\overline{p}_j\underline{p}_j}\otimes\mat{K}_{p_j,\overline{p}_j\underline{p}_j}\otimes\mat{I}_{p_j})
\end{displaymath}
are permutations.
\end{example}
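The derivative formula of the preceding example can be checked numerically by comparing $\D\mat{F}(\mat{\Omega}_j)\,\d\vec\mat{\Omega}_j$ against the corresponding term of the differential; a \texttt{numpy} sketch for $r = 3$ and $j = 2$ (all helpers are ours, vectorization is column-major):
\begin{verbatim}
import numpy as np
from functools import reduce

def K_mat(p, q):                         # commutation matrix, see above
    K = np.zeros((p * q, p * q))
    for i in range(p):
        for j in range(q):
            K[j + q * i, i + p * j] = 1.0
    return K

vec = lambda A: A.ravel(order="F")

ps = (2, 3, 2); j = 1                    # r = 3 modes, j = 2 (0-based index 1)
Omegas = [np.eye(pk) + 0.2 for pk in ps]
p_over = int(np.prod(ps[j + 1:])); p_under = int(np.prod(ps[:j])); pj = ps[j]
O_over = reduce(np.kron, list(reversed(Omegas[j + 1:])), np.eye(1))
O_under = reduce(np.kron, list(reversed(Omegas[:j])), np.eye(1))

P_j = (np.kron(K_mat(p_over * pj, p_under), K_mat(p_over * pj, p_under))
       @ np.kron(np.eye(p_over * p_under),
                 np.kron(K_mat(pj, p_over * p_under), np.eye(pj))))
DF = P_j @ np.kron(vec(np.kron(O_under, O_over))[:, None], np.eye(pj * pj))

dO = np.random.randn(pj, pj); dO = dO + dO.T          # symmetric direction
assert np.allclose(DF @ vec(dO),
                   vec(np.kron(O_over, np.kron(dO, O_under))))
\end{verbatim}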
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Stuff}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Let $X, Y$ be $p$- and $q$-dimensional random variables, respectively. Furthermore, let $\E X = \mu_X$, $\E Y = \mu_Y$ as well as $\cov(X) = \mat{\Sigma}_X$ and $\cov(Y) = \mat{\Sigma}_Y$. Then define the standardized random variables $Z_X = \mat{\Sigma}_X^{-1/2}(X - \mu_X)$ and $Z_Y = \mat{\Sigma}_Y^{-1/2}(Y - \mu_Y)$. For the standardized variables it holds that $\E Z_X = 0_p$, $\E Z_Y = 0_q$, and for the covariances we get $\cov(Z_X) = \mat{I}_p$ as well as $\cov(Z_Y) = \mat{I}_q$. Now we take a look at the cross-covariance between $X$ and $Y$
\begin{displaymath}
\cov(X, Y)
= \cov(X - \mu_X, Y - \mu_Y)
= \cov(\mat{\Sigma}_X^{1/2} Z_X, \mat{\Sigma}_Y^{1/2} Z_Y)
= \mat{\Sigma}_X^{1/2}\cov(Z_X, Z_Y)\mat{\Sigma}_Y^{1/2}.
\end{displaymath}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Proofs}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{proof}[Proof of Theorem~\ref{thm:sdr}]
abc
\end{proof}
\end{document}