% tensor_predictors/LaTeX/paper.tex
%
% 1715 lines
% 162 KiB
% TeX

\documentclass[a4paper, 10pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[LSF, T1]{fontenc}
\usepackage{chessfss}
\usepackage{fullpage}
\usepackage{amsmath, amssymb, amstext, amsthm, scalerel, bm, pifont}
\usepackage[dvipsnames]{xcolor} % dvipsnames loads more named colors
\usepackage{graphicx} % colors, images and graphics
\usepackage{tikz} % TikZ (TeX ist kein Zeichenprogramm)
\usepackage{environ} % for dynamic TikZ picture scaling
\usepackage{algorithm, algpseudocode} % Pseudo Codes / Algorithms
\usepackage[
style = apa, % citation style
isbn = false, % show isbn?
maxbibnames = 50, % maximal number of names in bibilography
maxcitenames = 2, % maximal number of names in text before et al.
autocite = inline, % look for autocite \autocite (inline = \parencite)
block = space, % small horizontal space between fields
backrefstyle = three+, % summarise pages, e.g. p. 2f, 6ff, 7-10
date = year, % date format
% backend = biber,
giveninits = true, % abbreviate first name
clearlang = false, % show language?
uniquename = init,
natbib = false,
dashed = true,
url = false,
doi = false,
bibencoding = utf8
]{biblatex}
\usepackage[pdftex, colorlinks, allcolors=blue]{hyperref} % Load as last package! Redefines commands
\usepackage[noabbrev, capitalize, nameinlink]{cleveref} % but this after hyperref
\usetikzlibrary{calc, perspective, datavisualization}
\setcounter{MaxMatrixCols}{20} % Sets the max nr. of columns in AMSmath's matrix envs to 20
% Document meta info
\title{GMLM Paper}
\author{Daniel Kapla}
\date{\today}
% Set PDF title, author and creator.
\AtBeginDocument{
\hypersetup{
pdftitle = {GMLM Paper},
pdfauthor = {Daniel Kapla},
pdfcreator = {\pdftexbanner}
}
}
% Bibliography resource(s)
\addbibresource{main.bib}
% Setup theorem-like environments (amsthm); every environment has its own counter
% Theorem, Lemma, Corollary, Proposition, Example (`plain' style)
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}{Corollary}
\newtheorem{proposition}{Proposition}
\newtheorem{example}{Example}
% Definition, Condition (`definition' style)
\theoremstyle{definition}
\newtheorem{definition}{Definition}
\newtheorem{condition}{Condition}
% Remark (`remark' style)
\theoremstyle{remark}
\newtheorem{remark}{Remark}
% NOTE(review): this makes cleveref treat labels of the `section' counter as
% type `appendix'; presumably only wanted for appendix sections -- confirm
\crefalias{section}{appendix} % ???
% cleveref has no built-in name for the custom `condition' environment
\crefname{condition}{Condition}{Conditions}
\Crefname{condition}{Condition}{Conditions}
% \creflabelformat{condition}{#2{\color{blue}(#1)}#3}
% label part of a reference range of conditions; a range takes an en-dash (`--'), not a hyphen
\crefrangelabelformat{condition}{#3#1#4--#5#2#6}
% % add (not included by default) clever-ref reference forms
% % \crefname{<name>}{<singular-form>}{<plural-form>}
% \crefname{definition}{definition}{definitions}
% matrices (bold symbol)
\newcommand*{\mat}[1]{\boldsymbol{#1}}
% tensors (special case for lower case calligraphic letters)
% \pdfstrcmp compares the argument, as a string, with a single backtick and
% expands to 1 when the argument sorts after it. The backtick is the character
% directly before `a', so an argument starting with a lower case letter takes
% the first (fraktur) branch, while upper case letters, digits and control
% sequences (e.g. \eta, which starts with a backslash) take the \mathcal branch.
% \pdfstrcmp is a pdfTeX primitive.
% NOTE(review): the line ends inside the definition insert space tokens; these
% are presumably harmless because the macro is only used in math mode -- confirm.
\newcommand*{\ten}[1]{
\ifnum\pdfstrcmp{#1}{`}=1 % argument sorts after the backtick: lower case
\mathfrak{#1}
\else % upper case letter, digit or control sequence
\mathcal{#1}
\fi
}
% set, a collection of elements (typeset in fraktur)
\newcommand{\manifold}[1]{\mathfrak{#1}}
% ternary operator (C style arguments: <condition> ? <val_if_true> : <val_if_false>)
% typeset as `<val_if_true> if <condition> else <val_if_false>'
\newcommand{\ternary}[3]{{#2}{\ \mathrm{if}\ }{#1}{\ \mathrm{else}\ }{#3}}
% Define math macros
% always use the wide hat
\renewcommand{\hat}{\widehat}
% \newcommand*{\ten}[1]{\mathcal{#1}}
\DeclareMathOperator{\sym}{sym}
% NOTE: \vec no longer is the arrow accent, it is the vectorization operator
\renewcommand*{\vec}{\operatorname{vec}}
% inverse vectorization; the exponent is kept outside of \operatorname because
% amsmath typesets a `-' inside an operator name as a text hyphen instead of a
% minus sign. The outer \mathop...\nolimits keeps the whole symbol an operator.
\newcommand*{\unvec}{\mathop{\operatorname{vec}^{-1}}\nolimits}
\newcommand*{\reshape}[1]{\operatorname{reshape}_{#1}}
\newcommand*{\vech}{\operatorname{vech}}
\newcommand*{\rank}{\operatorname{rank}}
\newcommand*{\diag}{\operatorname{diag}}
\newcommand*{\perm}[1]{\mathfrak{S}_{#1}} % set of permutations of size #1
\newcommand*{\len}[1]{\#{#1}} % length of #1
\DeclareMathOperator*{\ttt}{\circledast}
\DeclareMathOperator{\tr}{tr}
\DeclareMathOperator{\var}{Var}
\DeclareMathOperator{\cov}{Cov}
\DeclareMathOperator{\Span}{span}
% expectation; \DeclareMathOperator already wraps its argument in an operator
\DeclareMathOperator{\E}{\mathbb{E}}
% \DeclareMathOperator{\independent}{{\bot\!\!\!\bot}}
\DeclareMathOperator*{\argmin}{{arg\,min}}
\DeclareMathOperator*{\argmax}{{arg\,max}}
% NOTE: \H, \d and \t are text accent commands of the LaTeX kernel; after the
% redefinitions below they are no longer available as accents (e.g. in names
% of bibliography entries).
\newcommand*{\D}{\textnormal{D}} % derivative
\renewcommand*{\H}{\textnormal{H}} % hessian
\renewcommand*{\d}{\textnormal{d}} % differential
\renewcommand*{\t}[1]{{#1^{T}}} % matrix transpose
\newcommand*{\pinv}[1]{{#1^{\dagger}}} % `Moore-Penrose pseudoinverse`
\newcommand*{\K}{\mathcal{K}} % rearrangement operator, generalization of the Van Loan and Pitsianis rearrangement operation
% \let\checkmarkCopy\checkmark % see: https://tex.stackexchange.com/questions/47351/can-i-redefine-a-command-to-contain-itself
% colored check mark and cross built from the pifont dingbats 51 and 55;
% \checkmark is redefined, so its previous definition is lost
\renewcommand{\checkmark}{{\color{Green}\ding{51}}}
\newcommand{\xmark}{{\color{Red!70}\ding{55}}}
% Pseudo Code Commands
% \Break typesets a bold `break' as its own \State line
\newcommand{\algorithmicbreak}{\textbf{break}}
\newcommand{\Break}{\State \algorithmicbreak}
% Special Matrix Sets (Manifolds)
% Notation only: every macro typesets the symbol of a set of matrices, the
% arguments are the matrix dimensions (and, where present, a rank or band width).
% \MatMani{p}{q}, \StiefelNonCompact{p}{q}, \Stiefel{p}{q}: #1 x #2 matrices
\newcommand{\MatMani}[2]{\mathbb{R}^{{#1}\times {#2}}}
\newcommand{\StiefelNonCompact}[2]{\mathbb{R}_{*}^{{#1}\times {#2}}}
\newcommand{\Stiefel}[2]{\mathrm{St}^{{#1}\times {#2}}}
% \MatRankMani{k}{p}{q}: #2 x #3 matrices with `rank = #1' as subscript
\newcommand{\MatRankMani}[3]{\mathbb{R}_{\rank={#1}}^{{#2}\times {#3}}}
% square #1 x #1 matrices
\newcommand{\DiagZeroMat}[1]{\mathbb{R}_{\diag=0}^{{#1}\times {#1}}}
\newcommand{\SymMat}[1]{\mathrm{Sym}^{{#1}\times {#1}}}
\newcommand{\SkewSymMat}[1]{\mathrm{Skew}^{{#1}\times {#1}}}
\newcommand{\SymPosDefMat}[1]{\mathrm{Sym}_{++}^{{#1}\times {#1}}}
% uses the argument as dimension (was hard coded to `p\times p'), in line with \DiagZeroMat
\newcommand{\SymDiagZeroMat}[1]{\mathrm{Sym}_{\diag=0}^{{#1}\times {#1}}}
% \SymBand{p}{l}: #1 x #1 matrices with subscript #2
\newcommand{\SymBand}[2]{\mathrm{SymBand}^{{#1}\times {#1}}_{#2}}
% groups U(#1) and SU(#1)
\newcommand{\UnitaryGrp}[1]{\mathrm{U}(#1)}
\newcommand{\SpecialUnitaryGrp}[1]{\mathrm{SU}(#1)}
% Colored inline notes for the draft: \todo prints `TODO: <text>' in red,
% \efi prints `Effie: <text>' in blue.
\newcommand{\todo}[1]{{\color{red}TODO: #1}}
\newcommand{\efi}[1]{{\color{blue}Effie: #1}}
% Swap in the empty definitions below to hide all notes
% \newcommand{\todo}[1]{}
% \newcommand{\efi}[1]{}
%%% Custom operators with either one or two arguments (limits)
\makeatletter
%%% Multi-Linear Multiplication
% $\mlm_{k \in [r]}$ or $\mlm_{k = 1}^{r}$ (lower limit MUST be the first!)
% \mlm is a delimited macro, it has to be followed directly by `_'.
% Save first argument as \arg@one, then peek at the next token (stored in \next)
\def\mlm_#1{\def\arg@one{#1}\futurelet\next\mlm@i}
% Check for second argument: continue with \mlm@two if a `^' follows, with \mlm@one otherwise
\def\mlm@i{\ifx\next^\expandafter\mlm@two\else\expandafter\mlm@one\fi}
% specialization for one or two arguments, both versions use saved first argument
% The operator symbol is \times scaled relative to \bigotimes (the optional
% `1.65em' presumably caps its width -- see the scalerel documentation).
% The four \mathchoice branches are display, text, script and scriptscript
% style. In display style the limit is set in a zero width box, so a wide
% limit overflows on both sides instead of adding space around the operator.
\def\mlm@one{\mathchoice%
{\operatorname*{\scalerel*[1.65em]{\times}{\bigotimes}}_{\makebox[0pt][c]{$\scriptstyle \arg@one$}}}%
{\operatorname*{\scalerel*[1.65em]{\times}{\bigotimes}}_{\arg@one}}%
{\operatorname*{\scalerel*[1.65em]{\times}{\bigotimes}}_{\arg@one}}%
{\operatorname*{\scalerel*[1.65em]{\times}{\bigotimes}}_{\arg@one}}%
}
% this command's single argument is the second argument of \mlm, it gobbles the `^`
\def\mlm@two^#1{\mathchoice%
{\operatorname*{\scalerel*[1.65em]{\times}{\bigotimes}}_{\makebox[0pt][c]{$\scriptstyle \arg@one$}}^{\makebox[0pt][c]{$\scriptstyle #1$}}}%
{\operatorname*{\scalerel*[1.65em]{\times}{\bigotimes}}_{\arg@one}^{#1}}%
{\operatorname*{\scalerel*[1.65em]{\times}{\bigotimes}}_{\arg@one}^{#1}}%
{\operatorname*{\scalerel*[1.65em]{\times}{\bigotimes}}_{\arg@one}^{#1}}%
}
%%% Big Circle (Iterated Outer Product)
% $\bigouter_{k \in S}$ or $\bigouter_{k = 1}^{r}$; \bigouter has to be
% followed directly by `_' (lower limit first).
% Save the lower limit as \arg@one, then peek at the next token (stored in \next)
\def\bigouter_#1{\def\arg@one{#1}\futurelet\next\bigouter@i}
% Continue with \bigouter@two if a `^' follows, with \bigouter@one otherwise
\def\bigouter@i{\ifx\next^\expandafter\bigouter@two\else\expandafter\bigouter@one\fi}
% Lower limit only. The symbol is \circ scaled relative to \bigotimes; in
% display style (first \mathchoice branch) the limit is set in a zero width box.
\def\bigouter@one{\mathchoice%
{\operatorname*{\scalerel*{\circ}{\bigotimes}}_{\makebox[0pt][c]{$\scriptstyle \arg@one$}}}%
{\operatorname*{\scalerel*{\circ}{\bigotimes}}_{\arg@one}}%
{\operatorname*{\scalerel*{\circ}{\bigotimes}}_{\arg@one}}%
{\operatorname*{\scalerel*{\circ}{\bigotimes}}_{\arg@one}}%
}
% Lower and upper limit; the single argument is the upper limit, the `^' is gobbled
\def\bigouter@two^#1{\mathchoice%
{\operatorname*{\scalerel*{\circ}{\bigotimes}}_{\makebox[0pt][c]{$\scriptstyle \arg@one$}}^{\makebox[0pt][c]{$\scriptstyle #1$}}}%
{\operatorname*{\scalerel*{\circ}{\bigotimes}}_{\arg@one}^{#1}}%
{\operatorname*{\scalerel*{\circ}{\bigotimes}}_{\arg@one}^{#1}}%
{\operatorname*{\scalerel*{\circ}{\bigotimes}}_{\arg@one}^{#1}}%
}
%%% Big Kronecker Product (with overflowing limits)
% $\bigkron_{k \in S}$ or $\bigkron_{k = 1}^{r}$; \bigkron has to be followed
% directly by `_' (lower limit first).
% Save first argument as \arg@one, then peek at the next token (stored in \next)
\def\bigkron_#1{\def\arg@one{#1}\futurelet\next\bigkron@i}
% Check for second argument: continue with \bigkron@two if a `^' follows, with \bigkron@one otherwise
\def\bigkron@i{\ifx\next^\expandafter\bigkron@two\else\expandafter\bigkron@one\fi}
% specialization for one or two arguments, both versions use saved first argument
% Only the display style branch (the first one) differs from a plain
% \bigotimes: its limits are set in zero width boxes so they can overflow.
\def\bigkron@one{\mathchoice%
{\bigotimes_{\makebox[0pt][c]{$\scriptstyle \arg@one$}}}%
{\bigotimes_{\arg@one}}%
{\bigotimes_{\arg@one}}%
{\bigotimes_{\arg@one}}%
}
% this command's single argument is the second argument of \bigkron, the `^' is gobbled
\def\bigkron@two^#1{\mathchoice%
{\bigotimes_{\makebox[0pt][c]{$\scriptstyle \arg@one$}}^{\makebox[0pt][c]{$\scriptstyle #1$}}}%
{\bigotimes_{\arg@one}^{#1}}%
{\bigotimes_{\arg@one}^{#1}}%
{\bigotimes_{\arg@one}^{#1}}%
}
\makeatother
%%% Scaling TikZ pictures using the `environ' package
% see: https://tex.stackexchange.com/questions/6388/how-to-scale-a-tikzpicture-to-textwidth
% Usage: \begin{scaletikzpicturetowidth}{<width>} <picture> \end{scaletikzpicturetowidth}
% The body is processed twice: once with \tikzscale set to 1 inside a box to
% measure its natural width, then again with \tikzscale set to
% <width> / <natural width>.
% NOTE(review): the body has to apply \tikzscale itself, presumably via
% `\begin{tikzpicture}[scale=\tikzscale]' -- confirm against the included pictures.
\makeatletter
\newsavebox{\measure@tikzpicture}
\NewEnviron{scaletikzpicturetowidth}[1]{%
\def\tikz@width{#1}%
% first pass: typeset at scale 1 into the box, nothing is printed
\def\tikzscale{1}\begin{lrbox}{\measure@tikzpicture}%
\BODY
\end{lrbox}%
% scale factor = requested width / measured width
\pgfmathparse{#1/\wd\measure@tikzpicture}%
\edef\tikzscale{\pgfmathresult}%
% second pass: typeset the picture with the computed scale
\BODY
}
\makeatother
%%% Notation for manifolds and asymptotics
% \smoothFunc[k]{M} gives C^k(M); the optional order defaults to \infty.
% (Before, the optional argument ended up as the argument of C and the
% mandatory one as its order, i.e. \smoothFunc{M} gave C^M(\infty).)
\newcommand{\smoothFunc}[2][\infty]{{C^{#1}(#2)}}
% \localSmoothFunc[k]{p}{M} gives C^k_p(M); the optional order defaults to \infty.
% NOTE(review): argument order (point, manifold) chosen to match \tangentSpace
% -- confirm against the call sites.
\newcommand{\localSmoothFunc}[3][\infty]{{C^{#1}_{#2}(#3)}}
% \tangentSpace{p}{M} gives T_p M, \tangentBundle{M} gives T M; the cotangent
% versions carry a star. All four may be used in text or math (\ensuremath).
\newcommand{\tangentSpace}[2]{\ensuremath{T_{#1}{#2}}}
\newcommand{\tangentBundle}[1]{\ensuremath{T{#1}}}
\newcommand{\cotangentSpace}[2]{\ensuremath{T^{*}_{#1}{#2}}}
\newcommand{\cotangentBundle}[1]{\ensuremath{T^{*}{#1}}}
% fraktur X of the argument
\newcommand{\vectorField}[1]{{\mathfrak{X}(#1)}}
\newcommand{\Exp}{\operatorname{Exp}}
% underlined argument
\newcommand{\coords}[1]{\underline{#1}}
\newcommand{\grad}{\operatorname{grad}}
\newcommand{\hess}{\operatorname{hess}}
% big-O and small-o symbols
\newcommand{\BigO}{\mathcal{O}}
\newcommand{\smallo}{o}
\begin{document}
%\tableofcontents
%------------------------------------------------------------------------------%
\section{Introduction}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Notation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We start by introducing the notation we use throughout the paper. Let $\ten{A} = (\ten{A}_{i_1, \ldots, i_r})\in\mathbb{R}^{q_1\times \ldots\times q_r}$ be an order\footnote{Also referred to as rank, therefore the variable name $r$, but this term is \emph{not} used as it leads to confusion with the rank as in ``the rank of a matrix''.} $r$ tensor, where $r\in\mathbb{N}$ is the number of modes or axes (dimensions) of $\ten{A}$ and $\ten{A}_{i_1,...,i_r} \in \mathbb{R}$ is its $(i_1, \ldots, i_r)$th entry. For example, a $p \times q$ matrix $\mat{B}$ has two modes, the rows and columns. For matrices $\mat{B}_k\in\mathbb{R}^{p_k\times q_k}$, $k\in[r] = \{1, 2, \ldots, r\}$, the \emph{multi-linear multiplication}, or \emph{Tucker Operator} \parencite{MultilinearOperators-Kolda2006}, is defined element wise as
\begin{displaymath}
(\ten{A}\times\{\mat{B}_1, \ldots, \mat{B}_r\})_{j_1, \ldots, j_r} = \sum_{i_1, \ldots, i_r = 1}^{q_1, \ldots, q_r} \ten{A}_{i_1, \ldots, i_r}(\mat{B}_{1})_{j_1, i_1} \cdots (\mat{B}_{r})_{j_r, i_r}
\end{displaymath}
which results in an order $r$ tensor of dimension $p_1\times \ldots\times p_r$. As a special case, this yields the \emph{$k$-mode product} between the tensor $\ten{A}$ and the matrix $\mat{B}_k$,
\begin{displaymath}
\ten{A}\times_k\mat{B}_k = \ten{A}\times\{\mat{I}_{q_1}, \ldots, \mat{I}_{q_{k-1}}, \mat{B}_{k}, \mat{I}_{q_{k+1}}, \ldots, \mat{I}_{q_r}\}.
\end{displaymath}
Furthermore, the notation $\ten{A}\mlm_{k\in S}\mat{B}_k$ is shorthand for the iterative application of the mode product for all indices in $S\subseteq[r]$. For example, $\ten{A}\mlm_{k\in\{2, 5\}}\mat{B}_k = \ten{A}\times_2\mat{B}_2\times_5\mat{B}_5$. By only allowing $S$ to be a set, this notation is unambiguous, because the mode product commutes for different modes: $j\neq k\Rightarrow\ten{A}\times_j\mat{B}_j\times_k\mat{B}_k = \ten{A}\times_k\mat{B}_k\times_j\mat{B}_j$.
%Matrices and tensors can be \emph{vectorized} by the \emph{vectorization} operator $\vec$.
The operator $\vec$ maps an array to a vector. For example, $\vec(\mat{B})$ stands for the $pq \times 1$ vector of the $p \times q$ matrix $\mat{B}$ resulting from stacking the columns of $\mat{B}$ one after the other. For a tensor $\ten{A}= (a_{i_1,\ldots,i_r})$ of order $r$ and dimensions $q_1, \ldots, q_r$, $\vec(\ten{A})$ is the $q_1 q_2 \ldots q_r \times 1$ vector with the elements of $\ten{A}$ stacked one after the other in the specified order $r$ then $r-1$, and so on. For example, if $\ten{A}$ is a 3-dimensional array, $\vec(\ten{A})=(\vec(\ten{A}(:,:,1))^T,\vec(\ten{A}(:,:,2))^T,\ldots,\vec(\ten{A}(:,:,q_3))^T)^T$. We use the notation $\ten{A}\equiv \ten{B}$ for objects $\ten{A}, \ten{B}$ of any shape if and only if $\vec(\ten{A}) = \vec(\ten{B})$.
The \emph{inner product} between two tensors of the same order and dimensions is
\begin{displaymath}
\langle\ten{A}, \ten{B}\rangle = \sum_{i_1, \ldots, i_r} \ten{A}_{i_1, \ldots, i_r}\ten{B}_{i_1, \ldots, i_r}
\end{displaymath}
that leads to the definition of the \emph{Frobenius Norm} for tensors, $\|\ten{A}\|_F = \sqrt{\langle\ten{A}, \ten{A}\rangle}$ and is the straightforward extension of the Frobenius norm for matrices and vectors. %are also used for matrices while for a vector $\mat{a}$ the \emph{2 norm} is $\|\mat{a}\|_2 = \sqrt{\langle\mat{a}, \mat{a}\rangle}$.
The \emph{outer product} between two tensors $\ten{A}$ of dimensions $q_1, \ldots, q_r$ and $\ten{B}$ of dimensions $p_1, \ldots, p_l$ is a tensor $\ten{A}\circ\ten{B}$ of order $r + l$ and dimensions $q_1, \ldots, q_r, p_1, \ldots, p_l$ such that
\begin{displaymath}
\ten{A}\circ\ten{B} \equiv (\vec\ten{A})\t{(\vec{\ten{B}})}.
\end{displaymath}
Let $\K : \mathbb{R}^{q_1\times\ldots\times q_{2 r}}\to\mathbb{R}^{q_1 q_{r + 1}\times\ldots\times q_r q_{2 r}}$ be defined element wise with indices $1\leq i_j + 1\leq q_j q_{r + j}$ for $j = 1, \ldots, r$ as
\begin{align*}
\K(\ten{A})_{i_1 + 1, ..., i_r + 1} &= \ten{A}_{\lfloor i_1 / q_{r + 1}\rfloor + 1, ..., \lfloor i_r / q_{2 r} \rfloor + 1, (i_1\operatorname{mod}q_{r + 1}) + 1, ..., (i_r\operatorname{mod}q_{2 r}) + 1}
\end{align*}
where $\lfloor\,.\,\rfloor$ is the floor operation and $a\operatorname{mod}b$ is the integer division remainder of $a / b$. The mapping $\K$ is a linear operation and maps an order $2 r$ tensor to an order $r$ tensor by reshaping and permuting its elements. This operation allows us to define a generalization of the \emph{Kronecker product}, which we define as $\ten{A}\otimes\ten{B} = \K(\ten{A}\circ\ten{B})$.
For tensors of order at least $2$, the \emph{flattening} (or \emph{unfolding} or \emph{matricization}) is a reshaping of the tensor into a matrix along a particular mode. For a tensor $\ten{A}$ of order $r$ and dimensions $q_1, \ldots, q_r$, the $k$-mode unfolding $\ten{A}_{(k)}$ is a $q_k\times \prod_{l=1, l\neq k}q_l$ matrix with %For the tensor $\ten{A} = (a_{i_1,\ldots,i_r})\in\mathbb{R}^{q_1, \ldots, q_r}$ the
elements %of the $k$ unfolded tensor $\ten{A}_{(k)}$ are
\begin{displaymath}
(\ten{A}_{(k)})_{i_k, j} = \ten{A}_{i_1, \ldots, i_r}\quad\text{ with }\quad j = 1 + \sum_{\substack{l = 1\\l \neq k}}^r (i_l - 1) \prod_{\substack{m = 1\\m\neq k}}^{l - 1}q_m.
\end{displaymath}
% The rank of a tensor $\ten{A}$ of dimensions $q_1\times ...\times q_r$ is vector-valued; that is, $\rank(\ten{A}) = (a_1, \ldots, a_r)\in[q_1]\times...\times[q_r]$, where $a_k = \rank(\ten{A}_{(k)})$ is the usual matrix rank of the $k$ unfolded tensor.
The gradient of a function $\ten{F}(\ten{X})$ of any shape, univariate, multivariate or tensor valued, with argument $\ten{X}$ of any shape is defined as
\begin{displaymath}
\nabla_{\ten{X}}\ten{F} = \frac{\partial\t{(\vec\ten{F}(\ten{X}))}}{\partial(\vec\ten{X})}
\end{displaymath}
which is a matrix of dimension $p\times q$ where the vectorized quantities $\vec{\ten{X}}\in\mathbb{R}^p$ and $\vec\ten{F}(\ten{X})\in\mathbb{R}^q$. This is consistent with the gradient of a real-valued function $f(\mat{x})$ where $\mat{x}\in\mathbb{R}^p$ as $\nabla_{\mat{x}}f\in\mathbb{R}^{p\times 1}$. \todo{Maybe reference magnus and abadir, magnus and neudecker?!}
\todo{$\vech\ten{A}$ the half vectorization! Define via the vector containing the tensor elements with indices in \emph{reflected lexicographic order}? Also don't forget to figure out how to (if even) to define the half vectorization of a tensor (with all modes of the same dimensions)}
\todo{$\sym{\ten{A}}$ tensor needed?!}
\todo{I think the following examples are a good idea for the appendix.}
\begin{example}[Vectorization]\label{ex:vectorization}
Given a matrix
\begin{displaymath}
\mat{A} = \begin{pmatrix}
1 & 4 & 7 \\
2 & 5 & 8 \\
3 & 6 & 9
\end{pmatrix}
\end{displaymath}
its vectorization is $\vec{\mat{A}} = \t{(1, 2, 3, 4, 5, 6, 7, 8, 9)}$ and its half vectorization $\vech{\mat{A}} = \t{(1, 2, 3, 5, 6, 9)}$. Let $\ten{A}$ be a tensor of dimension $3\times 3\times 3$ given by
\begin{displaymath}
\ten{A}_{:,:,1} = \begin{pmatrix}
1 & 4 & 7 \\
2 & 5 & 8 \\
3 & 6 & 9
\end{pmatrix},
\qquad
\ten{A}_{:,:,2} = \begin{pmatrix}
10 & 13 & 16 \\
11 & 14 & 17 \\
12 & 15 & 18
\end{pmatrix},
\qquad
\ten{A}_{:,:,3} = \begin{pmatrix}
19 & 22 & 25 \\
20 & 23 & 26 \\
21 & 24 & 27
\end{pmatrix}.
\end{displaymath}
Then the vectorization of $\ten{A}$ is given by
\begin{displaymath}
\vec{\ten{A}} = \t{(1, 2, 3, 4, ..., 26, 27)}\in\mathbb{R}^{27}
\end{displaymath}
while the half vectorization is
\begin{displaymath}
\vech{\ten{A}} = \t{(1, 2, 3, 5, 6, 9, 11, 12, 15, 21)}\in\mathbb{R}^{10}.
\end{displaymath}
\end{example}
\begin{example}[Matricization]
Let $\ten{A}$ be the $3\times 4\times 2$ tensor given by
\begin{displaymath}
\ten{A}_{:,:,1} = \begin{pmatrix}
1 & 4 & 7 & 10 \\
2 & 5 & 8 & 11 \\
3 & 6 & 9 & 12
\end{pmatrix},
\ten{A}_{:,:,2} = \begin{pmatrix}
13 & 16 & 19 & 22 \\
14 & 17 & 20 & 23 \\
15 & 18 & 21 & 24
\end{pmatrix}.
\end{displaymath}
Its matricizations are then
\begin{gather*}
\ten{A}_{(1)} = \begin{pmatrix}
1 & 4 & 7 & 10 & 13 & 16 & 19 & 22 \\
2 & 5 & 8 & 11 & 14 & 17 & 20 & 23 \\
3 & 6 & 9 & 12 & 15 & 18 & 21 & 24
\end{pmatrix},
\qquad
\ten{A}_{(2)} = \begin{pmatrix}
1 & 2 & 3 & 13 & 14 & 15 \\
4 & 5 & 6 & 16 & 17 & 18 \\
7 & 8 & 9 & 19 & 20 & 21 \\
10 & 11 & 12 & 22 & 23 & 24
\end{pmatrix}, \\
\ten{A}_{(3)} = \begin{pmatrix}
1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10 & 11 & 12 \\
13 & 14 & 15 & 16 & 17 & 18 & 19 & 20 & 21 & 22 & 23 & 24
\end{pmatrix}.
\end{gather*}
\end{example}
% \begin{example}[Symmetrization]
% Let $\ten{A}$ be the $3\times 3\times 3$ tensor from \cref{ex:vectorization}, then the symmetrization of $\ten{A}$ is
% \begin{align*}
% (\sym{\ten{A}})_{:,:,1} &= \frac{1}{6}\begin{pmatrix}
% 6 & 32 & 58 \\
% 32 & 58 & 84 \\
% 58 & 84 & 110
% \end{pmatrix}, \\
% (\sym{\ten{A}})_{:,:,2} &= \frac{1}{6}\begin{pmatrix}
% 32 & 58 & 84 \\
% 58 & 84 & 110 \\
% 84 & 110 & 136
% \end{pmatrix}, \\
% (\sym{\ten{A}})_{:,:,3} &= \frac{1}{6}\begin{pmatrix}
% 58 & 84 & 110 \\
% 84 & 110 & 136 \\
% 110 & 136 & 162
% \end{pmatrix}.
% \end{align*}
% \end{example}
% \begin{example}[Half Vectorization]
% The half vectorization of a square matrix
% \begin{displaymath}
% \mat{A} = \begin{pmatrix}
% 1 & 4 & 7 \\
% 2 & 5 & 8 \\
% 3 & 6 & 9
% \end{pmatrix}
% \end{displaymath}
% is
% \begin{displaymath}
% \vech{\mat{A}} = (1, 2, 3, 5, 6, 9).
% \end{displaymath}
% \end{example}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Problem Formulation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Our goal is to identify the cumulative distribution function (cdf) $F$ of $Y\mid \ten{X}$, where $\ten{X}$ is assumed to admit $r$-tensor structure of dimension $p_1\times \ldots \times p_r$ with continuous or discrete entries and there is no constraint on the form of $Y$. The predictor $\ten{X}$ is a complex object; to simplify the problem we assume there exists a function $\ten{R}:\ten{X}\mapsto \ten{R}(\ten{X})$ such that $\ten{R}(\ten{X})$ is tensor valued of lower dimension so that
\begin{displaymath}
F(Y\mid \ten{X}) = F(Y\mid \ten{R}(\ten{X})).
\end{displaymath}
Since $\ten{R}(\ten{X})$ replaces the predictors without any effect in the conditional cdf of $Y\mid \ten{X}$, it is a \emph{sufficient reduction} for the regression $Y\mid\ten{X}$. We assume the tensor valued $\ten{R}(\ten{X})$ has dimension $q_1\times...\times q_r$ with $q_j\leq p_j$, $j = 1, ..., r$, which represents a dimension reduction along every mode of $\ten{X}$. This formulation is flexible as it allows, for example, to select ``important'' modes by reducing ``unimportant'' modes to be $1$ dimensional.
To find such a reduction $\ten{R}$, we leverage the equivalence pointed out in \textcite{FisherLectures-Cook2007},
\begin{equation}\label{eq:inverse-regression-sdr}
Y\mid\ten{X} \sim Y\mid \ten{R}(\ten{X})
\quad\Longleftrightarrow\quad
\ten{X}\mid(Y, \ten{R}(\ten{X})) \sim \ten{X}\mid\ten{R}(\ten{X}),
\end{equation}
which means that a \textit{sufficient statistic} $\ten{R}(\ten{X})$ for $Y$ in the inverse regression $\ten{X}\mid Y$, where $Y$ is considered as a parameter indexing the model, is also a sufficient reduction for $\ten{X}$ in the forward regression $Y\mid\ten{X}$. The equivalent inverse regression in \eqref{eq:inverse-regression-sdr} provides exhaustive characterization of $\ten{R}(\ten{X})$.
The usual tool to identify sufficient statistics is the factorization theorem that requires a distributional model. Here we assume the distribution of $\ten{X}\mid Y$ belongs to the \emph{quadratic exponential family} in order to (a) simplify modeling and (b) keep estimation feasible. An important feature of the \emph{quadratic exponential family} is that its members are characterized by their first two moments. Specifically, we assume that $\ten{X}\mid Y$ is a full rank quadratic exponential family with density
\begin{align}
f_{\mat{\eta}_y}(\ten{X}\mid Y = y)
&= h(\ten{X})\exp(\t{\mat{\eta}_y}\mat{t}(\ten{X}) - b(\mat{\eta}_y)) \nonumber \\
&= h(\ten{X})\exp(\langle \mat{t}_1(\ten{X}), \mat{\eta}_{1y} \rangle + \langle \mat{t}_2(\ten{X}), \mat{\eta}_{2y} \rangle - b(\mat{\eta}_{y})) \label{eq:quad-density}
\end{align}
where $\mat{t}_1(\ten{X})=\vec \ten{X}$ and $\mat{t}_2(\ten{X})$ is linear in $\ten{X}\circ\ten{X}$. The dependence of $\ten{X}$ on $Y$ is fully captured in the natural parameter $\mat{\eta}_y$. The function $h$ is non-negative real-valued. For $b$ we assume it is at least twice continuously differentiable and strictly convex.
Distributions within the quadratic exponential family include the \emph{tensor normal} \todo{cite, if can be found} and \emph{tensor Ising model} \todo{cite} (a generalization of the (inverse) Ising model which is multi-variate Bernoulli with up to second order interactions) and mixtures of these two.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{The Generalized Multi-Linear Model}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
In model \eqref{eq:quad-density}, the relationship of $\ten{X}$ and $Y$ is absorbed in $\mat{\eta}_y$, and $\mat{t}(\ten{X})$ is the minimal sufficient statistic for the \textit{pseudo}-parameter\footnote{$\mat{\eta}_y$ is a function of the response $Y$, thus it is not a parameter in the formal statistical sense. It is considered as a parameter when using the equivalence in \eqref{eq:inverse-regression-sdr} and view $Y$ as a parameter as a device to derive the sufficient reduction from the inverse regression.} $\mat{\eta}_y = (\mat{\eta}_{1y}, \mat{\eta}_{2y})$ with
\begin{align}\label{eq:t-stat}
\mat{t}(\ten{X}) &= (\mat{t}_1(\ten{X}),\mat{t}_2(\ten{X}))=(\vec{\ten{X}}, \mat{T}_2\vech((\vec\ten{X})\t{(\vec\ten{X})})),
\end{align}
where the $d\times p(p + 1) / 2$ dimensional matrix $\mat{T}_2$ with $p = \prod_{i = 1}^r p_i$ ensures that $\mat{\eta}_{2y}$ is of minimal dimension $d$. The matrix $\mat{T}_2$ is of full rank $d$ and is unique to specific members of the quadratic exponential family.
We can reexpress the exponent in \eqref{eq:quad-density} as
\begin{align*}
\t{\mat{\eta}_y} \mat{t}(\ten{X})
&= \langle \vec \ten{X}, \mat{\eta}_{1y} \rangle + \langle \mat{T}_2\vech(\ten{X}\circ\ten{X}), \mat{\eta}_{2y} \rangle = \langle \vec \ten{X}, \mat{\eta}_{1y} \rangle + \langle \vec(\ten{X}\circ\ten{X}), \t{(\mat{T}_2\pinv{\mat{D}_p})}\mat{\eta}_{2y} \rangle
\end{align*}
where $\mat{D}_p$ is the \emph{duplication matrix} from \textcite[Ch.~11]{MatrixAlgebra-AbadirMagnus2005}, defined so that $\mat{D}_p\vech \mat{A} = \vec \mat{A}$ for every symmetric $p\times p$ matrix $\mat{A}$, and $\pinv{\mat{D}_p}$ is its Moore-Penrose pseudo inverse. The first natural parameter component, $\mat{\eta}_{1y}$, captures the first order, and $\mat{\eta}_{2y}$, the second order relationship of $Y$ and $\ten{X}$. The quadratic exponential density of $\ten{X} \mid Y$ can then be expressed as
\begin{equation}\label{eq:quadratic-exp-fam}
f_{\mat{\eta}_y}(\ten{X}\mid Y = y) = h(\ten{X})\exp(\langle \vec \ten{X}, \mat{\eta}_{1y} \rangle + \langle \vec(\ten{X}\circ\ten{X}), \t{(\mat{T}_2\pinv{\mat{D}_p})}\mat{\eta}_{2y} \rangle - b(\mat{\eta}_y)).
\end{equation}
The exponential family in \eqref{eq:quadratic-exp-fam} is easily generalizable to any order. This, though, would result in the number of parameters becoming prohibitive to estimate. This is also the reason why we opted for the second order exponential family in our formulation.
By the equivalence in \eqref{eq:inverse-regression-sdr}, in order to find the sufficient reduction $\ten{R}(\ten{X})$ we need to infer $\mat{\eta}_{1y}$, and $\mat{\eta}_{2y}$. This is reminiscent of generalized linear modeling, which we extend to a multi-linear formulation next.
Suppose $\ten{F}_y$ is a known mapping of $y$ with zero expectation $\E_Y\ten{F}_Y = 0$. We assume the dependence of $\ten{X}$ and $Y$ is reflected only in the first parameter and let
\begin{align}
\mat{\eta}_{1y} &= \vec{\overline{\ten{\eta}}} + \mat{B}\vec\ten{F}_y, \label{eq:eta1-manifold} \\
\mat{\eta}_{2} &= \t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})})}\vec(c\,\mat{\Omega}), \label{eq:eta2-manifold}
\end{align}
where $\overline{\ten{\eta}}\in\mathbb{R}^{p_1\times\ldots\times p_r}$, $\mat{\Omega} \in \mathbb{R}^{p \times p}$ is positive definite with $p = \prod_{j = 1}^{r} p_j$, and $c\in\mathbb{R}$ is a known constant determined by the distribution to ease modeling. That is, we assume that only $\mat{\eta}_{1y}$ depends on $Y$ through $\mat{B}$. The second parameter $\mat{\eta}_2$ captures the second order interaction structure of $\ten{X}$, which we assume not to depend on the response $Y$. In order to relate individual modes of $\ten{X}$ to the response, allowing flexibility in modeling, we assume $\ten{F}_y$ takes values in $\mathbb{R}^{q_1\times ...\times q_r}$; that is, $\ten{F}_y$ is a tensor valued independent variable in the inverse regression setting. This, in turn, leads to imposing corresponding tensor structure to the regression parameter $\mat{B}$. Thus, \eqref{eq:eta1-manifold} becomes
\begin{align}
\mat{\eta}_{1y} &=
\vec\biggl(\overline{\ten{\eta}} + \ten{F}_y\mlm_{j = 1}^{r}\mat{\beta}_j\biggr), \label{eq:eta1}
\end{align}
where $\mat{B} = \bigotimes_{j = r}^{1}\mat{\beta}_j$ and the component matrices $\mat{\beta}_j\in\mathbb{R}^{p_j\times q_j}$ are of known rank for $j = 1, \ldots, r$. Given the high potential value of $p$, we further assume that
\begin{align}
\t{(\mat{T}_2\pinv{\mat{D}_p})}\mat{\eta}_{2y}= \t{(\mat{T}_2\pinv{\mat{D}_p})}\mat{\eta}_{2} &= \vec\biggl(c\bigotimes_{j = r}^{1}\mat{\Omega}_j\biggr), \label{eq:eta2}
\end{align}
where $\mat{\Omega}_j\in\mathbb{R}^{p_j\times p_j}$ are symmetric positive definite matrices for $j = 1, \ldots, r$. That is, we require $\mat{\Omega} = \bigotimes_{j = r}^{1}\mat{\Omega}_j$, which substantially reduces the number of parameters to estimate in $\mat{\Omega}$. The assumption that the $\mat{\Omega}_j$'s be positive definite is possible due to the constant $c$.
Equation \eqref{eq:eta2} is underdetermined since $\t{(\mat{T}_2\pinv{\mat{D}_p})}$ has full column rank $d < p^2$ (with a non-strict inequality if $\ten{X}$ is univariate) but $\mat{\eta}_2$ is uniquely determined given any $\mat{\Omega}$ as $\t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})})}$ has full row rank. We let $\mat{\xi} = (\vec{\overline{\ten{\eta}}}, \vec{\mat{B}}, \vech{\mat{\Omega}})$ be the unconstrained $p(p + 2 q + 3) / 2$-dimensional parameter vector, where $q = \prod_{j = 1}^{r} q_j$, and $\mat{\theta} = (\vec{\overline{\ten{\eta}}}, \vec{\mat{B}}, \vech{\mat{\Omega}})$ be the constrained parameter vector, where $\mat{B}=\bigotimes_{j = r}^{1}\mat{\beta}_j$ and $\mat{\Omega} = \bigotimes_{j = r}^{1}\mat{\Omega}_j$. We also let $\Xi$ and $\Theta$ denote the unconstrained and constrained parameter spaces, with $\mat{\xi}$ and $\mat{\theta}$ varying in $\Xi$ and $\Theta$, respectively. The parameter space $\Xi$ is an open subset of $\mathbb{R}^{p(p + 2 q + 3) / 2}$ so that \eqref{eq:quadratic-exp-fam} is a proper density. Later, we relax the assumptions for $\mat{\beta}_k$ and $\mat{\Omega}_k$ as a consequence of \cref{thm:param-manifold} in \cref{sec:kron-manifolds}.
% \todo{Maybe already here introduce the ``constraint'' set of $\Omega$'s allowed as $\{ \Omega\in\mathbb{R}_{++}^{p\times p} : \vec{\Omega} = \t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})}\mat{T}_2\pinv{\mat{D}_p})}\vec{\Omega} \}$}
In a classical \emph{generalized linear model} (GLM), the link function connecting the natural parameters to the expectation of the sufficient statistic $\mat{\eta}_y = \mat{g}(\E[\mat{t}(\ten{X}) \mid Y = y])$ is invertible. Such a link may not exist in our setting, but for our purpose what we call the ``inverse'' link suffices. The ``inverse'' link $\widetilde{\mat{g}}$ exists as the natural parameters fully describe the distribution. As in the non-minimal formulation \eqref{eq:quadratic-exp-fam}, we define the ``inverse'' link through its tensor valued components
\begin{align}
\ten{g}_1(\mat{\eta}_y) &= \E[\ten{X} \mid Y = y], \label{eq:inv-link1}\\
\ten{g}_2(\mat{\eta}_y) &= \E[\ten{X}\circ\ten{X} \mid Y = y] \label{eq:inv-link2}
\end{align}
as $\widetilde{\mat{g}}(\mat{\eta}_y) = (\vec\ten{g}_1(\mat{\eta}_y), \vec\ten{g}_2(\mat{\eta}_y))$.
Under the quadratic exponential family model \eqref{eq:quadratic-exp-fam}, a sufficient reduction for the regression of $Y$ on $\ten{X}$ is given in \cref{thm:sdr}.
\begin{theorem}[SDR]\label{thm:sdr}
A sufficient reduction for the regression $Y\mid \ten{X}$ under the quadratic exponential family inverse regression model \eqref{eq:quadratic-exp-fam} with natural parameters \eqref{eq:eta1} and \eqref{eq:eta2} is given by
\begin{align}\label{eq:sdr}
\ten{R}(\ten{X})
% &= (\ten{X} - \E\ten{X})\times\{ \t{\mat{\beta}_1}, \ldots, \t{\mat{\beta}_r} \}.
&= (\ten{X} - \E\ten{X})\mlm_{j = 1}^{r}\t{\mat{\beta}_j}.
\end{align}
The reduction \eqref{eq:sdr} is minimal if $\mat{\beta}_j$ are full rank for all $j=1,\ldots,r$.
\end{theorem}
The reduction in vectorized form is $\vec\ten{R}(\ten{X})=\t{\mat{B}}\vec(\ten{X} - \E\ten{X})$, where $\mat{B} = \bigotimes_{k = r}^{1}\mat{\beta}_k$ with $\Span(\mat{B}) = \Span(\{\mat{\eta}_{1y} - \E_{Y}\mat{\eta}_{1Y} : y\in\mathcal{S}_Y\})$, using $\mathcal{S}_Y$ to denote the set of values of the random variable $Y$.
\cref{thm:sdr} obtains that the \emph{sufficient reduction} $\ten{R}(\ten{X})$ reduces $\ten{X}$ along each dimension linearly. The graph in \cref{fig:SDRvisual} is a visual depiction of the sufficient reduction.
\begin{figure}
\centering
\begin{scaletikzpicturetowidth}{0.5\textwidth}
\input{images/reduction.tex}
\end{scaletikzpicturetowidth}
\caption{\label{fig:SDRvisual}Visual depiction of the sufficient reduction in \cref{thm:sdr}.}
\end{figure}
\begin{example}[Vector valued $\mat{x}$ ($r = 1$)]\label{ex:vector-dist}
Given vector valued predictor $\mat{X}\in\mathbb{R}^p$, the tensor order is $r = 1$, then the collection of parameters is $\mat{\theta} = (\overline{\mat{\eta}}, \mat{\beta}, \mat{\Omega})$ with $\overline{\mat{\eta}}\in\mathbb{R}^p$, $\mat{\beta}\in\StiefelNonCompact{p}{q}$ and $\mat{\Omega}\in\SymPosDefMat{p}$ where $\mat{f}_y\in\mathbb{R}^q$ are known functions of the response $Y$. The conditional density of $\mat{X}\mid Y = y$ is given by
\begin{align*}
f_{\theta}(\mat{x}\mid Y = y)
&= h(\mat{x})\exp(\langle\mat{x}, \mat{\eta}_{1y}(\mat{\theta})\rangle + \langle\vec(\mat{x}\circ\mat{x}), \mat{\eta}_2(\mat{\theta})\rangle - b(\mat{\eta}_y(\mat{\theta}))) \\
% &= h(\mat{x})\exp(\t{\mat{\eta}_{1y}(\theta)}\mat{x} + \t{\vec(\mat{x}\circ\mat{x})}\mat{\eta}_2(\mat{\theta}) - b(\mat{\eta}_y(\mat{\theta}))) \\
&= h(\mat{x})\exp(\t{(\overline{\mat{\eta}} + \mat{\beta}\mat{f}_y)}\mat{x} + c\,\t{\mat{x}}\mat{\Omega}\,\mat{x} - b(\mat{\eta}_y(\mat{\theta}))).
\end{align*}
using the relation of $\mat{\theta}$ to the natural parameters given by $\mat{\eta}_{1y}(\mat{\theta}) = \overline{\mat{\eta}} + \mat{\beta}\mat{f}_y$ and $\mat{\eta}_2(\mat{\theta}) = c\,\mat{\Omega}$.
% where the number of unknown parameters is $p + \dim(\StiefelNonCompact{p}{q}) + \dim(\SymPosDefMat{p}) = p\frac{p + 2 q + 3}{2}$.
\end{example}
\begin{example}[Matrix valued $\mat{X}$ ($r = 2$)]
Assuming $\mat{X}$ to be matrix valued, that is $r = 2$, $\mat{\theta} = (\overline{\mat{\eta}}, \mat{\beta}_1, \mat{\beta}_2, \mat{\Omega}_1, \mat{\Omega}_2)$, where the intercept term $\overline{\mat{\eta}}\in\mathbb{R}^{p_1\times p_2}$ is now matrix valued. Similar to \cref{ex:vector-dist} with $\mat{F}_y\in\mathbb{R}^{q_1\times q_2}$ being matrix valued, the conditional density of $\mat{X}\mid Y = y$ reads
\begin{align*}
f_{\mat{\theta}}(\mat{X}\mid Y = y)
&= h(\mat{X})\exp(\langle\vec{\mat{X}}, \mat{\eta}_{1y}(\mat{\theta})\rangle + \langle\vec(\mat{X}\circ\mat{X}), \mat{\eta}_2(\mat{\theta})\rangle - b(\mat{\eta}_y(\mat{\theta}))) \\
&= h(\mat{X})\exp(\tr((\overline{\mat{\eta}} + \mat{\beta}_1\mat{F}_y\t{\mat{\beta}_2})\t{\mat{X}}) + c \tr(\mat{\Omega}_1\mat{X}\mat{\Omega}_2\t{\mat{X}}) - b(\mat{\eta}_y(\mat{\theta}))).
\end{align*}
\end{example}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Maximum Likelihood Estimation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Suppose $(\ten{X}_i, Y_i)$ are independently and identically distributed with joint cdf $F(\ten{X}, Y)$, for $i = 1, \ldots, n$. The empirical log-likelihood function of \eqref{eq:quadratic-exp-fam} under \eqref{eq:eta1} and \eqref{eq:eta2}, ignoring terms not depending on the parameters, is
\begin{equation}\label{eq:log-likelihood}
l_n(\mat{\theta}) = \frac{1}{n}\sum_{i = 1}^n \biggl(\Bigl\langle\overline{\ten{\eta}} + \ten{F}_{y_i}\mlm_{k = 1}^{r}\mat{\beta}_k, \ten{X}_i \Bigr\rangle + c\Bigl\langle\ten{X}_i\mlm_{k = 1}^{r}\mat{\Omega}_k, \ten{X}_i \Bigr\rangle - b(\mat{\eta}_{y_i})\biggr).
\end{equation}
The maximum likelihood estimate of $\mat{\theta}_0$ is the solution to the optimization problem
\begin{equation}\label{eq:mle}
\hat{\mat{\theta}}_n = \argmax_{\mat{\theta}\in\Theta}l_n(\mat{\theta})
\end{equation}
with $\hat{\mat{\theta}}_n = (\vec\widehat{\overline{\ten{\eta}}}, \vec\widehat{\mat{B}}, \vech\widehat{\mat{\Omega}})$ where $\widehat{\mat{B}} = \bigkron_{k = r}^{1}\widehat{\mat{\beta}}_k$ and $\widehat{\mat{\Omega}} = \bigkron_{k = r}^{1}\widehat{\mat{\Omega}}_k$.
A straightforward and general method for parameter estimation is \emph{gradient descent}. To apply gradient based optimization, we compute the gradients of $l_n$ in \cref{thm:grad}.
\begin{theorem}\label{thm:grad}
For $n$ i.i.d.\ observations $(\ten{X}_i, y_i), i = 1, \ldots, n$ the log-likelihood is of the form in \eqref{eq:log-likelihood} with $\mat{\theta}$ being the collection of all GMLM parameters $\overline{\ten{\eta}}$, ${\mat{B}} = \bigkron_{k = r}^{1}{\mat{\beta}}_k$ and ${\mat{\Omega}} = \bigkron_{k = r}^{1}{\mat{\Omega}}_k$ for $k = 1, \ldots, r$. Let $\ten{G}_2(\mat{\eta}_y)$ be a tensor of dimensions $p_1, \ldots, p_r$ such that
\begin{displaymath}
\vec{\ten{G}_2(\mat{\eta}_y)} = \pinv{(\mat{T}_2\pinv{\mat{D}_p})}\mat{T}_2\pinv{\mat{D}_p}\vec{\ten{g}_2(\mat{\eta}_y)}.
\end{displaymath}
Then, the partial gradients with respect to $\overline{\ten{\eta}}, \mat{\beta}_1, \ldots, \mat{\beta}_r, \mat{\Omega}_1, \ldots, \mat{\Omega}_r$ are given by
\begin{align*}
\nabla_{\overline{\ten{\eta}}}l_n &= \vec\frac{1}{n}\sum_{i = 1}^n (\ten{X}_i - \ten{g}_1(\mat{\eta}_{y_i})), \\
\nabla_{\mat{\beta}_j}l_n &= \vec\frac{1}{n}\sum_{i = 1}^n (\ten{X}_i - \ten{g}_1(\mat{\eta}_{y_i}))_{(j)}\t{\Big(\ten{F}_{y_i}\mlm_{k\in[r]\backslash j}\mat{\beta}_k\Big)_{(j)}}, \\
\nabla_{\mat{\Omega}_j}l_n &= \vec\frac{c}{n}\sum_{i = 1}^n (\ten{X}_i\otimes\ten{X}_i - \K(\ten{G}_2(\mat{\eta}_{y_i})))\mlm_{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}
\end{align*}
which obtains $\nabla l_n = (\nabla_{\overline{\ten{\eta}}}l_n, \nabla_{\mat{\beta}_1}l_n, \ldots, \nabla_{\mat{\beta}_r}l_n, \nabla_{\mat{\Omega}_1}l_n, \ldots, \nabla_{\mat{\Omega}_r}l_n)$.
If $\mat{T}_2$ is the identity matrix $\mat{I}_{p(p + 1) / 2}$, then $\ten{G}_2(\mat{\eta}_y) = \ten{g}_2(\mat{\eta}_y)$.
\end{theorem}
Although the general case of any GMLM model can be fitted via gradient descent using \cref{thm:grad}, this may be very inefficient. In \cref{thm:grad}, $\mat{T}_2$ can be used to introduce flexible second moment structures. For example, it allows modeling effects differently for predictor components, as described in \cref{sec:ising_estimation} after Eqn. \eqref{eq:ising-cond-prob}. In the remainder, we focus on $\mat{T}_2$'s that are identity matrices. This approach simplifies the estimation algorithm and the speed of the numerical calculation in the case of tensor normal predictors. In the case of the tensor normal distribution, an iterative cyclic updating scheme is derived in \cref{sec:tensor_normal_estimation}, which has much faster convergence, is stable and does not require hyper parameters, as will be discussed later. On the other hand, the Ising model does not allow such a scheme. There we need to use a gradient based method, which is the subject of \cref{sec:ising_estimation}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Tensor Normal}\label{sec:tensor_normal_estimation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Suppose $\ten{X}\mid Y = y$ follows a tensor normal distribution with mean $\ten{\mu}_y$ and covariance $\mat{\Sigma} = \bigkron_{k = r}^{1}\mat{\Sigma}_k$. We assume the distribution is non-degenerate which means that the covariances $\mat{\Sigma}_k$ are symmetric positive definite matrices. Its density is given by
\begin{displaymath}
f_{\mat{\theta}}(\ten{X}\mid Y = y) = (2\pi)^{-p / 2}\prod_{k = 1}^{r}\det(\mat{\Sigma}_k)^{-p / 2 p_k}\exp\left( -\frac{1}{2}\left\langle\ten{X} - \ten{\mu}_y, (\ten{X} - \ten{\mu}_y)\mlm_{k = 1}^{r}\mat{\Sigma}_k^{-1} \right\rangle \right).
\end{displaymath}
For the sake of simplicity and w.l.o.g., we assume $\ten{X}$ has 0 marginal expectation; i.e., $\E\ten{X} = 0$. Rewriting this in the quadratic exponential family form \eqref{eq:quadratic-exp-fam}, determines the scaling constant $c = -1/2$. The relation to the GMLM parameters $\overline{\ten{\eta}}, \mat{\beta}_k$ and $\mat{\Omega}_k$, for $k = 1, \ldots, r$ is
\begin{displaymath}
\ten{\mu}_y = \ten{F}_y\mlm_{k = 1}^{r}\mat{\Omega}_k^{-1}\mat{\beta}_k, \qquad
\mat{\Omega}_k = \mat{\Sigma}_k^{-1},
\end{displaymath}
where we used that $\overline{\ten{\eta}} = 0$ due to $0 = \E\ten{X} = \E\E[\ten{X}\mid Y] = \E\ten{\mu}_Y$ in combination with $\E\ten{F}_Y = 0$. Additionally, all the $\mat{\Omega}_k$'s are symmetric positive definite, because the $\mat{\Sigma}_k$'s are. This leads to another simplification since then $\mat{T}_2$ in \eqref{eq:t-stat} equals the identity. This also means that the gradients of the log-likelihood $l_n$ in \cref{thm:grad} are simpler. We obtain
\begin{displaymath}
\ten{g}_1(\mat{\eta}_y) = \E[\ten{X}\mid Y = y] = \ten{\mu}_y, \qquad
\ten{G}_2(\mat{\eta}_y) = \ten{g}_2(\mat{\eta}_y) = \E[\ten{X}\circ\ten{X}\mid Y = y] \equiv \bigkron_{k = r}^1\mat{\Sigma}_k + (\vec{\ten{\mu}}_y)\t{(\vec{\ten{\mu}}_y)}.
\end{displaymath}
In practice, we assume we have a random sample of $n$ observations $(\ten{X}_i, \ten{F}_{y_i})$ from the joint distribution. We start the estimation process by demeaning them. Then, only the reduction matrices $\mat{\beta}_k$ and the scatter matrices $\mat{\Omega}_k$ need to be estimated. To solve the optimization problem \eqref{eq:mle}, with $\overline{\ten{\eta}} = 0$ we initialize the parameters using a simple heuristic approach. % For initial estimates $\hat{\mat{\beta}}_k^{(0)}$ we
First, we compute moment based mode-wise marginal covariance estimates $\widehat{\mat{\Sigma}}_k(\ten{X})$ and $\widehat{\mat{\Sigma}}_k(\ten{F}_Y)$ as
\begin{displaymath}
\widehat{\mat{\Sigma}}_k(\ten{X}) = \frac{1}{n}\sum_{i = 1}^{n} (\ten{X}_i)_{(k)}\t{(\ten{X}_i)_{(k)}}, \qquad
\widehat{\mat{\Sigma}}_k(\ten{F}_Y) = \frac{1}{n}\sum_{i = 1}^{n} (\ten{F}_{y_i})_{(k)}\t{(\ten{F}_{y_i})_{(k)}}.
\end{displaymath}
Then, for every mode $k = 1, \ldots, r$, we compute the first $j = 1, \ldots, q_k$ eigenvectors $\mat{v}_j(\widehat{\mat{\Sigma}}_k(\ten{X}))$, $\mat{v}_j(\widehat{\mat{\Sigma}}_k(\ten{F}_Y))$ and eigenvalues $\lambda_j(\widehat{\mat{\Sigma}}_k(\ten{X}))$, $\lambda_j(\widehat{\mat{\Sigma}}_k(\ten{F}_Y))$ of the marginal covariance estimates. We set
\begin{align*}
\mat{U}_k &= (\mat{v}_1(\widehat{\mat{\Sigma}}_k(\ten{X})), \ldots, \mat{v}_{q_k}(\widehat{\mat{\Sigma}}_k(\ten{X}))), \\
\mat{D}_k &= \diag(\lambda_1(\widehat{\mat{\Sigma}}_k(\ten{X}))\lambda_1(\widehat{\mat{\Sigma}}_k(\ten{F}_{Y})), \ldots, \lambda_{q_k}(\widehat{\mat{\Sigma}}_k(\ten{X}))\lambda_{q_k}(\widehat{\mat{\Sigma}}_k(\ten{F}_{Y}))), \\
\mat{V}_k &= (\mat{v}_1(\widehat{\mat{\Sigma}}_k(\ten{F}_Y)), \ldots, \mat{v}_{q_k}(\widehat{\mat{\Sigma}}_k(\ten{F}_Y))).
\end{align*}
The initial value of $\mat{\beta}_k$ is
\begin{displaymath}
\hat{\mat{\beta}}_k^{(0)} = \mat{U}_k\sqrt{\mat{D}_k}\t{\mat{V}_k},
\end{displaymath}
and the initial value of $\mat{\Omega}_k$ is set to the identity $\mat{\Omega}_k^{(0)} = \mat{I}_{p_k}$, for $k=1,\ldots,r$.
Given $\hat{\mat{\beta}}_1, \ldots, \hat{\mat{\beta}}_r, \hat{\mat{\Omega}}_1, \ldots, \hat{\mat{\Omega}}_r$, we take the gradient $\nabla_{\mat{\beta}_j}l_n$ of the tensor normal log-likelihood $l_n$ in \eqref{eq:log-likelihood} applying \cref{thm:grad} and keep all other parameters except $\mat{\beta}_j$ fixed. Then, $\nabla_{\mat{\beta}_j}l_n = 0$ has the closed form solution
\begin{equation}\label{eq:tensor_normal_beta_solution}
\t{\mat{\beta}_j} = \biggl(
\sum_{i = 1}^{n}
\Bigl( \ten{F}_{y_i}\mlm_{k \neq j}\hat{\mat{\Omega}}_k^{-1}\hat{\mat{\beta}}_k \Bigr)_{(j)}
\t{\Bigl( \ten{F}_{y_i}\mlm_{k \neq j}\hat{\mat{\beta}}_k \Bigr)_{(j)}}
\biggr)^{-1}
\biggl(
\sum_{i = 1}^{n}
\Bigl( \ten{F}_{y_i}\mlm_{k \neq j}\hat{\mat{\beta}}_k \Bigr)_{(j)}
\t{(\ten{X}_{i})_{(j)}}
\biggr)
\hat{\mat{\Omega}}_j.
\end{equation}
%For the scatter matrices $\mat{\Omega}_j$, we need to fudge a bit.
Equating the partial gradient of the $j$th scatter matrix $\mat{\Omega}_j$ in \cref{thm:grad} to zero ($\nabla_{\mat{\Omega}_j}l_n = 0$) gives a quadratic matrix equation. This is due to the dependence of $\ten{\mu}_y$ on $\mat{\Omega}_j$. In practice though, it is faster, more stable, and equally accurate to use mode-wise covariance estimates via the residuals
\begin{displaymath}
\hat{\ten{R}}_i = \ten{X}_i - \hat{\ten{\mu}}_{y_i} = \ten{X}_i - \ten{F}_{y_i}\mlm_{k = 1}^{r}\hat{\mat{\Omega}}_k^{-1}\hat{\mat{\beta}}_k.
\end{displaymath}
The estimates are computed via
\begin{displaymath}
\tilde{\mat{\Sigma}}_j = \sum_{i = 1}^n (\hat{\ten{R}}_i)_{(j)} \t{(\hat{\ten{R}}_i)_{(j)}},
\end{displaymath}
where $\tilde{s}\tilde{\mat{\Sigma}}_j = \hat{\mat{\Omega}}_j^{-1}$. For scaling we use that the mean squared error has to be equal to the trace of the covariance estimate,
\begin{displaymath}
\frac{1}{n}\sum_{i = 1}^n \langle \hat{\ten{R}}_i, \hat{\ten{R}}_i \rangle = \tr\bigkron_{k = r}^{1}\hat{\mat{\Omega}}_k^{-1} = \prod_{k = 1}^{r}\tr{\hat{\mat{\Omega}}_k^{-1}} = \tilde{s}^r\prod_{k = 1}^{r}\tr{\tilde{\mat{\Sigma}}_k},
\end{displaymath}
so that
\begin{displaymath}
\tilde{s} = \biggl(\Bigl(\prod_{k = 1}^{r}\tr{\tilde{\mat{\Sigma}}_k}\Bigr)^{-1}\frac{1}{n}\sum_{i = 1}^n \langle \hat{\ten{R}}_i, \hat{\ten{R}}_i \rangle\biggr)^{1 / r}
\end{displaymath}
resulting in the estimates $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j)^{-1}$.
Estimation is then performed by updating the estimates $\hat{\mat{\beta}}_j$ via \eqref{eq:tensor_normal_beta_solution} for $j = 1, \ldots, r$, and then recomputing the $\hat{\mat{\Omega}}_j$ estimates simultaneously, keeping the $\hat{\mat{\beta}}_j$'s fixed. This procedure is repeated until convergence. % Convergence is very fast, experiments showed that convergence occures usualy in less than $10$ iterations.
A technical detail for numerical stability is to ensure that the scaled values $\tilde{s}\tilde{\mat{\Sigma}}_j$, assumed to be symmetric and positive definite, are well conditioned. Thus, we estimate the condition number of $\tilde{s}\tilde{\mat{\Sigma}}_j$ prior to computing the inverse. In case of ill-conditioning, we use the regularized $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j + 0.2 \lambda_{1}(\tilde{s}\tilde{\mat{\Sigma}}_j)\mat{I}_{p_j})^{-1}$ instead, where $\lambda_{1}(\tilde{s}\tilde{\mat{\Sigma}}_j)$ is the first (maximum) eigenvalue. Experiments showed that this regularization is usually only required in the first few iterations.
Furthermore, if the parameter space follows a more general setting as in \cref{thm:param-manifold}, updating may produce estimates outside the parameter space. A simple and efficient method is to project every updated estimate onto the corresponding manifold.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Ising Model}\label{sec:ising_estimation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The general distribution of a binary vector is modeled by the \emph{multivariate Bernoulli distribution} \parencite{GraphicalModels-Whittaker2009, MVB-Dai2012, MVB-DaiDingWahba2013}. The \emph{Ising model} \parencite{Ising-Ising1924} is a special case, considering only two-way interactions. Its probability mass function (PMF) for a binary random vector $X\in\{ 0, 1 \}^p$ with natural parameters $\mat{\gamma}\in\mathbb{R}^{p(p + 1) / 2}$ is given by
\begin{displaymath}
P_{\mat{\gamma}}(\mat{x}) = p_0(\mat{\gamma})\exp(\t{\vech(\mat{x}\t{\mat{x}})}\mat{\gamma}).
\end{displaymath}
The scaling factor $p_0(\mat{\gamma})\in\mathbb{R}_{+}$ ensures that $P_{\mat{\gamma}}$ is a PMF. It is equal to the probability of the zero event $P(X = \mat{0}) = p_0(\mat{\gamma})$. The reciprocal of $p_0$, more commonly known as the \emph{partition function}, is given by
\begin{equation}\label{eq:ising-partition-function}
p_0(\mat{\gamma})^{-1} = \sum_{\mat{x}\in\{0, 1\}^p}\exp(\t{\vech(\mat{x}\t{\mat{x}})}\mat{\gamma}).
\end{equation}
By an abuse of notation, let $\mat{\gamma}_{j l}$ denote the element of $\mat{\gamma}$ corresponding to $\mat{x}_j\mat{x}_l$ in $\vech(\mat{x}\t{\mat{x}})$\footnote{Specifically, the element $\mat{\gamma}_{j l}$ of $\mat{\gamma}$ is a shorthand for $\mat{\gamma}_{\iota(j, l)}$ with $\iota(j, l) = (\min(j, l) - 1)(2 p - \min(j, l)) / 2 + \max(j, l)$ mapping the matrix row index $j$ and column index $l$ to the corresponding half vectorization indices $\iota(j, l)$.}. The ``diagonal'' parameter $\mat{\gamma}_{j j}$ expresses the conditional log odds of $X_j = 1\mid X_{-j} = \mat{0}$, where the negative subscript in $X_{-j}$ describes the $p - 1$ dimensional vector $X$ with the $j$th element removed. The ``off diagonal'' parameters $\mat{\gamma}_{j l}$, for $j\neq l$, are equal to the conditional log odds of simultaneous occurrence $X_j = 1, X_l = 1 \mid X_{-j, -l} = \mat{0}$. More precisely, for $j\neq l$, the conditional probabilities and the natural parameters are related by
\begin{align}
\mat{\gamma}_{j j} &= \log\frac{P_{\mat{\gamma}}(X_j = 1\mid X_{-j} = \mat{0})}{1 - P_{\mat{\gamma}}(X_j = 1\mid X_{-j} = \mat{0})}, \nonumber \\
\mat{\gamma}_{j l} &= \log\frac{1 - P_{\mat{\gamma}}(X_j = 1\mid X_{-j} = \mat{0})P_{\mat{\gamma}}(X_l = 1\mid X_{-l} = \mat{0})}{P_{\mat{\gamma}}(X_j = 1\mid X_{-j} = \mat{0})P_{\mat{\gamma}}(X_l = 1\mid X_{-l} = \mat{0})}\frac{P_{\mat{\gamma}}(X_j = 1, X_l = 1\mid X_{-j, -l} = \mat{0})}{1 - P_{\mat{\gamma}}(X_j = 1, X_l = 1\mid X_{-j, -l} = \mat{0})} \label{eq:ising-two-way-log-odds}.
\end{align}
Conditional Ising models, incorporating the information of covariates $Y$ into the model, have also been considered \parencite{sparseIsing-ChengEtAt2014, sdr-mixedPredictors-BuraForzaniEtAl2022}. The direct way is to parameterize $\mat{\gamma} = \mat{\gamma}_y$ by the covariate $Y = y$ to model a conditional distribution $P_{\mat{\gamma}_y}(\mat{x}\mid Y = y)$.
We extend the conditional PMF by allowing the binary variables to be tensor valued, that is for $\ten{X}\in\{ 0, 1 \}^{p_1\times\cdots\times p_r}$ we set $\mat{x} = \vec{\ten{X}}$, with dimension $p = \prod_{k = 1}^{r}p_k$. Considering the tensor structure of $\ten{X}$ is done by assuming Kronecker product constraints on the parameter vector $\mat{\gamma}_y$ in a similar fashion as for the tensor normal model. This means that we compare the PMF $P_{\mat{\gamma}_y}(\vec{\ten{X}} \mid Y = y)$ to the quadratic exponential family \eqref{eq:quadratic-exp-fam} with the natural parameters modeled by \eqref{eq:eta1} and \eqref{eq:eta2}. A detail to be considered is that the diagonal of $(\vec{\ten{X}})\t{(\vec{\ten{X}})}$ is equal to $\vec{\ten{X}}$. This gives the GMLM model as
\begin{align}
P_{\mat{\gamma}_y}(\ten{X} \mid Y = y)
&= p_0(\mat{\gamma}_y)\exp(\t{\vech((\vec{\ten{X}})\t{(\vec{\ten{X}})})}\mat{\gamma}_y) \nonumber \\
&= p_0(\mat{\gamma}_y)\exp\Bigl(\Bigl\langle \ten{X}, \ten{F}_y\mlm_{k = 1}^{r}\mat{\beta}_k \Bigr\rangle + \Bigl\langle\ten{X}\mlm_{k = 1}^{r}\mat{\Omega}_k, \ten{X}\Bigr\rangle\Bigr)\label{eq:ising-cond-prob}
\end{align}
where we set $\overline{\ten{\eta}} = 0$ and $\mat{T}_2$ to the identity. This is an additional constraint on the model; the reason is that the diagonal elements of $\mat{\Omega} = \bigkron_{k = r}^{1}\mat{\Omega}_k$ take the role of $\overline{\ten{\eta}}$, although not fully. Having the diagonal of $\mat{\Omega}$ and $\overline{\ten{\eta}}$ handling the self interaction effects might lead to interference in the optimization routine. Another approach would be to use the $\mat{T}_2$ matrix to set the corresponding diagonal elements of $\mat{\Omega}$ to zero and let $\overline{\ten{\eta}}$ handle the self interaction effect. All of those approaches, namely setting $\overline{\ten{\eta}} = 0$, keeping $\overline{\ten{\eta}}$ or using $\mat{T}_2$, are theoretically solid and compatible with \cref{thm:grad,thm:param-manifold,thm:asymptotic-normality-gmlm}, assuming all axis dimensions $p_k$ are non-degenerate, that is $p_k > 1$ for all $k = 1, \ldots, r$. Regardless, under our modeling choice we get the relation between the natural parameters $\mat{\gamma}_y$ of the conditional Ising model and the GMLM parameters $\mat{\beta}_k$ and $\mat{\Omega}_k$ as
\begin{equation}\label{eq:ising-natural-params}
% \t{\pinv{\mat{D}_p}}\mat{\gamma}_y
% = \vec(\mat{\Omega} + \diag(\mat{B}\vec{\ten{F}_y}))
% = \vec\Biggl(\bigkron_{k = r}^{1}\mat{\Omega}_k + \diag\biggl(\vec\Bigl(\ten{F}_y\mlm_{k = 1}^{r}\mat{\beta}_k\Bigr)\biggr)\Biggr).
\mat{\gamma}_y
= \t{\mat{D}_p}\vec(\mat{\Omega} + \diag(\mat{B}\vec{\ten{F}_y}))
= \t{\mat{D}_p}\vec\Biggl(\bigkron_{k = r}^{1}\mat{\Omega}_k + \diag\biggl(\vec\Bigl(\ten{F}_y\mlm_{k = 1}^{r}\mat{\beta}_k\Bigr)\biggr)\Biggr).
\end{equation}
In contrast to the tensor normal GMLM, the matrices $\mat{\Omega}_k$ are only required to be symmetric. More specifically, we require $\mat{\Omega}_k$, for $k = 1, \ldots, r$, to be elements of an embedded submanifold of $\SymMat{p_k}$ (see: \cref{sec:kron-manifolds,sec:matrix-manifolds}). The mode wise reduction matrices $\mat{\beta}_k$ need to be elements of an embedded submanifold of $\mathbb{R}^{p_k\times q_k}$. Common choices are listed in \cref{sec:matrix-manifolds}. \todo{check if we need to exclude zero here!}
To solve the optimization problem \eqref{eq:mle}, given a data set $(\ten{X}_i, y_i)$, for $i = 1, \ldots, n$, we use a variation of gradient descent.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Initial Values}
The first step is to get reasonable starting values. Experiments showed that a good starting value of $\mat{\beta}_k$, for $k = 1, \ldots, r$, is to use the tensor normal estimates from \cref{sec:tensor_normal_estimation}, interpreting $\ten{X}_i$ as continuous. For initial values of $\mat{\Omega}_k$, a different approach is required. A natural candidate is the uninformed initial value $\mat{\Omega}_k = \mat{0}$, as this corresponds to conditional log odds of $1:1$ for every component and pairwise interaction. This is not possible, since $\mat{0}$ is a stationary point of the log-likelihood, which is directly observed by taking a look at the partial gradients of the log-likelihood in \cref{thm:grad}. Instead, we use a crude heuristic which treats every mode separately and ignores any relation to the covariates. It is computationally cheap and better than any of the alternatives we considered. For every $k = 1, \ldots, r$, let the $k$'th mode second moment estimate be
\begin{equation}\label{eq:ising-mode-moments}
\hat{\mat{M}}_{2(k)} = \frac{p_k}{n p}\sum_{i = 1}^n (\ten{X}_i)_{(k)}\t{(\ten{X}_i)_{(k)}}
\end{equation}
which contains the $k$'th mode first moment estimate in its diagonal $\hat{\mat{M}}_{1(k)} = \diag\hat{\mat{M}}_{2(k)}$. We consider every column of the matricized observation $(\ten{X}_i)_{(k)}$ as a $p_k$ dimensional observation itself. The number of those artificially generated observations is $n \prod_{j\neq k}p_j$. Let $Z_k$ denote the random variable of which those artificial observations are realizations. Then, we can interpret the elements $(\hat{\mat{M}}_{1(k)})_{j}$ as estimates of the probability $P((Z_k)_j = 1)$, that is the marginal probability of the $j$th element of $Z_k$ being $1$. Similarly, for $l \neq j$ we have $(\hat{\mat{M}}_{2(k)})_{j l}$ estimating $P((Z_k)_j = 1, (Z_k)_l = 1)$, the marginal probability of two-way interactions. % Without any regard of accuracy ...
Now, we set the diagonal elements of $\mat{\Omega}_k$ to zero. For the off diagonal elements of $\mat{\Omega}_k$, we equate the conditional probabilities $P((Z_k)_j = 1 \mid (Z_k)_{-j} = \mat{0})$ and $P((Z_k)_j = 1, (Z_k)_l = 1\mid (Z_k)_{-j, -l} = \mat{0})$ with the marginal probability estimates $(\hat{\mat{M}}_{1(k)})_{j}$ and $(\hat{\mat{M}}_{2(k)})_{j l}$, respectively. Using \eqref{eq:ising-two-way-log-odds} then gives the initial estimates $\hat{\mat{\Omega}}_k^{(0)}$, with $j \neq l$ component wise
\begin{equation}\label{eq:ising-init-Omegas}
(\hat{\mat{\Omega}}_k^{(0)})_{j j} = 0,
\qquad
(\hat{\mat{\Omega}}_k^{(0)})_{j l} = \log\frac{1 - (\hat{\mat{M}}_{1(k)})_{j}(\hat{\mat{M}}_{1(k)})_{l}}{(\hat{\mat{M}}_{1(k)})_{j}(\hat{\mat{M}}_{1(k)})_{l}}\frac{(\hat{\mat{M}}_{2(k)})_{j l}}{1 - (\hat{\mat{M}}_{2(k)})_{j l}}.
\end{equation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Gradient Optimization}
Given initial values, the gradients provided by \cref{thm:grad} can be evaluated for the Ising model. The first step therefore is to determine the values of the inverse link components $\ten{g}_1(\mat{\gamma}_y) = \E[\ten{X} \mid Y = y]$ and $\ten{G}_2(\mat{\gamma}_y) = \ten{g}_2(\mat{\gamma}_y) = \E[\ten{X}\circ\ten{X} \mid Y = y]$. An immediate simplification is that the first moment is a part of the second moment. Its values are determined via $\vec(\E[\ten{X} \mid Y = y]) = \diag(\E[\ten{X}\circ\ten{X} \mid Y = y]_{(1, \ldots, r)})$. This means only the second moment needs to be computed, or estimated (see: \cref{sec:ising-bigger-dim}) in the case of slightly bigger $p$. For the Ising model, the conditional second moment with parameters $\mat{\gamma}_y$ is given by the matricized relation
\begin{equation}\label{eq:ising-m2}
\ten{g}_2(\mat{\gamma}_y)_{(1, \ldots, r)} = \E[(\vec{\ten{X}})\t{(\vec{\ten{X}})}\mid Y = y] = p_0(\mat{\gamma}_y)\sum_{\mat{x}\in\{0, 1\}^{p}}\mat{x}\t{\mat{x}}\exp(\t{\vech(\mat{x}\t{\mat{x}})}\mat{\gamma}_y).
\end{equation}
The natural parameter $\mat{\gamma}_y$ is evaluated via \eqref{eq:ising-natural-params}, enabling us to compute the partial gradients of the log-likelihood $l_n$ \eqref{eq:log-likelihood} for the Ising model by \cref{thm:grad} for the GMLM parameters $\mat{\beta}_k$ and $\mat{\Omega}_k$, $k = 1, \ldots, r$, at the current iterate $\mat{\theta}^{(I)} = (\mat{\beta}_1^{(I)}, \ldots, \mat{\beta}_r^{(I)}, \mat{\Omega}_1^{(I)}, \ldots, \mat{\Omega}_r^{(I)})$. Using classic gradient ascent for maximizing the log-likelihood, we have to specify a learning rate $\lambda\in\mathbb{R}_{+}$, usually something like $10^{-3}$. The update rule is
\begin{displaymath}
\mat{\theta}^{(I + 1)} = \mat{\theta}^{(I)} + \lambda\nabla_{\mat{\theta}} l_n(\mat{\theta})\bigr|_{\mat{\theta} = \mat{\theta}^{(I)}},
\end{displaymath}
which is iterated till convergence. In practice, iteration is performed until either a maximum number of iterations is exhausted and/or some break condition is satisfied. A proper choice of the learning rate is needed, as a too large learning rate $\lambda$ causes instabilities, while a too low learning rate requires an enormous amount of iterations. Generically, there are two approaches to avoid the need to determine a proper learning rate. First, \emph{line search methods} determine an appropriate step size for every iteration. This works great if the evaluation of the objective function (the log-likelihood) is cheap. This is not the case in our setting, see \cref{sec:ising-bigger-dim}. The second approach is an \emph{adaptive learning rate}. The basic idea is to track specific statistics while optimizing and dynamically adapt the learning rate via well tested heuristics using the gathered knowledge from past iterations. We opted to use an adaptive learning rate approach; this not only alleviates the need to determine an appropriate learning rate, but also accelerates learning.
Our method of choice is RMSprop, which stands for \emph{root mean squared propagation} \parencite{rmsprop-Hinton2012}. This is a well known method in machine learning for training neural networks. It is a variation of gradient descent with a per scalar parameter adaptive learning rate. It tracks a moving average of the element wise squared gradient $\mat{g}_2^{(I)}$, which is then used to scale (element wise) the gradient in the update rule. See \textcite{rmsprop-Hinton2012,deeplearningbook-GoodfellowEtAl2016} among others. The update rule using RMSprop for maximization\footnote{Instead of the more common minimization, therefore $+$ in the update of $\mat{\theta}$.} is
\begin{align*}
\mat{g}_2^{(I + 1)} &= \nu \mat{g}_2^{(I)} + (1 - \nu)\nabla l_n(\mat{\theta}^{(I)})\odot\nabla l_n(\mat{\theta}^{(I)}), \\
\mat{\theta}^{(I + 1)} &= \mat{\theta}^{(I)} + \frac{\lambda}{\sqrt{\mat{g}_2^{(I + 1)}} + \epsilon}\odot\nabla l_n(\mat{\theta}^{(I)}).
\end{align*}
The parameters $\nu = 0.9$, $\lambda = 10^{-3}$ and $\epsilon\approx 1.49\cdot 10^{-8}$ are fixed. The initial value is $\mat{g}_2^{(0)} = \mat{0}$, and the symbol $\odot$ denotes the Hadamard product, that is the element wise multiplication. The division and square root operations are performed element wise as well. According to our experiments, RMSprop requires in the range of $50$ to $1000$ iterations until convergence, while gradient ascent with a learning rate of $10^{-3}$ is in the range of $1000$ to $10000$. \todo{check this!}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Small Data Sets}\label{sec:ising-small-data-sets}
In case of a finite number of observations, specifically in data sets with a small number of observations $n$, the situation where one component is always either zero or one can occur. It is also possible to observe two exclusive components. This situation of a ``degenerate'' data set needs to be safeguarded against in practice. Working with parameters on a log-scale, this gives estimates of $\pm\infty$. This is outside of the parameter space and breaks our optimization algorithm.
The first situation where this needs to be addressed is in \eqref{eq:ising-init-Omegas}, where we set initial estimates for $\mat{\Omega}_k$. To avoid division by zero as well as evaluating the log of zero, we adapt \eqref{eq:ising-mode-moments}, the mode wise moment estimates $\hat{\mat{M}}_{2(k)}$. A simple method is to replace the ``degenerate'' components, that are entries with value $0$ or $1$, with the smallest positive estimate of exactly one occurrence $p_k / n p$, or all but one occurrence $1 - p_k / n p$, respectively.
The same problem arises in gradient optimization. Therefore, before starting the optimization, we detect degenerate combinations. We compute upper and lower bounds for the ``degenerate'' element in the Kronecker product $\hat{\mat{\Omega}} = \bigkron_{k = r}^{1}\hat{\mat{\Omega}}_k$. After every gradient update, we check if any of the ``degenerate'' elements fall outside of the bounds. In that case, we adjust all the elements of the Kronecker component estimates $\hat{\mat{\Omega}}_k$, corresponding to the ``degenerate'' element of their Kronecker product, to fall inside the precomputed bounds. While doing so, we try to alter every component as little as possible to ensure that the non-degenerate elements in $\hat{\mat{\Omega}}$, affected by this change due to its Kronecker structure, are altered as little as possible. The exact details are technically cumbersome while providing little insight. \todo{For more details we refer the reader to the source code provided with the supplemental material.}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Slightly Bigger Dimensions}\label{sec:ising-bigger-dim}
A big challenge for the Ising model is its high computational complexity as it involves summing over all binary vectors of length $p = \prod_{k = 1}^{r}p_k$ in the partition function \eqref{eq:ising-partition-function}. Computing the partition function exactly requires summing over all $2^p$ binary vectors. For small dimensions, say $p\approx 10$, this is easily computed. Increasing the dimension beyond $20$ becomes extremely expensive while it is %absolutely
impossible for dimensions bigger than $30$. Trying to avoid the evaluation of the log-likelihood and only computing its partial gradients via \cref{thm:grad} does not resolve the issue. The gradients require the inverse link, in other words the second moment \eqref{eq:ising-m2}, which, even when dropping the scaling factor $p_0$, still involves summing $2^p$ terms. Basically, with our model, this means that the optimization of the Ising model using exactly computed gradients is impossible for moderately sized problems.
For estimation of dimensions $p$ bigger than $20$, we use a Monte-Carlo method to estimate the second moment \eqref{eq:ising-m2}, required to compute the partial gradients of the log-likelihood. Specifically, we use a Gibbs-Sampler to sample from the conditional distribution and approximate the second moment in an importance sampling framework. This can be implemented quite efficiently, and the estimation accuracy for the second moment, evaluated experimentally, seems to be very reliable. Simultaneously, we use the same approach to estimate the partition function. This though, is in comparison inaccurate, and may only be used to get a rough idea of the log-likelihood. Regardless, for our method, we only need the gradient for optimization where appropriate break conditions, not based on the likelihood, lead to a working method for maximum likelihood estimation.
\begin{figure}
\centering
\includegraphics[]{plots/sim-ising-perft-m2.pdf}
\caption{\label{fig:ising-m2-perft}Performance test for computing/estimating the second moment of the Ising model of dimension $p$ using either the exact method or a Monte-Carlo simulation.}
\end{figure}
\newpage
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Statistical Properties}
\subsection{Kronecker Product Manifolds}\label{sec:kron-manifolds}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\cref{thm:sdr} identifies the sufficient reduction for the regression of $Y$ on $\ten{X}$ in the population. Any estimation of the sufficient reduction requires application of some optimality criterion. As we operate within the framework of the exponential family, we opted for maximum likelihood estimation (MLE). For the unconstrained problem, where the parameters are simply $\mat{B}$ and $\mat{\Omega}$ in \eqref{eq:eta1-manifold}, maximizing the likelihood of $\ten{X} \mid Y$ is straightforward and yields well-defined MLEs of both parameters. Our setting, though, requires the constrained optimization of the $\ten{X} \mid Y$ likelihood subject to $\mat{B} = \bigkron_{j = r}^{1}\mat{\beta}_j$ and $\mat{\Omega}=\bigkron_{j = r}^{1}\mat{\Omega}_j$. \Cref{thm:kron-manifolds,thm:param-manifold} provide the setting for which the MLE of the constrained parameter $\mat{\theta}$ is well-defined, which in turn leads to the derivation of its asymptotic normality.
The main problem in obtaining asymptotic results for the MLE of the constrained parameter $\mat{\theta} = (\overline{\ten{\eta}}, \vec\mat{B}, \vech\mat{\Omega})$ stems from the nature of the constraint. We assumed that $\mat{B} = \bigkron_{k = r}^{1}\mat{\beta}_k$, where the parameter $\mat{B}$ is identifiable. This means that different values of $\mat{B}$ lead to different densities $f_{\mat{\theta}}(\ten{X}\mid Y = y)$, a basic property needed to ensure consistency of parameter estimates, which in turn is needed for asymptotic normality. On the other hand, the components $\mat{\beta}_j$, $j = 1, \ldots, r$, are \emph{not} identifiable, which is a direct consequence of the equality $\mat{\beta}_2\otimes\mat{\beta}_1 = (c\mat{\beta}_2)\otimes (c^{-1}\mat{\beta}_1)$ for every $c\neq 0$. This is the reason we formulated $\Theta$ as a constrained parameter space instead of parameterizing the densities of $\ten{X}\mid Y$ with respect to the components $\mat{\beta}_1, \ldots, \mat{\beta}_r$. The same is true for $\mat{\Omega} = \bigkron_{k = r}^{1}\mat{\Omega}_k$.
In addition to identifiable parameters, asymptotic normality obtained in \cref{thm:asymptotic-normality-gmlm} requires differentiation. Therefore, the parameter space itself needs to admit a notion of differentiation, which is usually ensured by assuming that it is a vector space. This is too strong an assumption for our purposes. To weaken the vector space assumption we consider \emph{smooth manifolds}. The latter are spaces which look like Euclidean spaces locally and allow the notion of differentiation. The more general \emph{topological} manifolds are too weak for differentiation. To make matters worse, a smooth manifold only allows first derivatives. Without going into details, the solution is a \emph{Riemannian manifold}. Similar to an abstract \emph{smooth manifold}, Riemannian manifolds are detached from our usual intuition as well as complicated to handle in an already complicated setting. This is where an \emph{embedded (sub)manifold} comes to the rescue. Simply speaking, an embedded manifold is a manifold which is a subset of a manifold from which it inherits its properties. If a manifold is embedded in a Euclidean space, almost all the complication of the abstract manifold theory simplifies drastically. Moreover, since a Euclidean space is itself a Riemannian manifold, we inherit the means for higher derivatives. Finally, a smooth embedded submanifold structure for the parameter space maintains consistency with existing approaches and results for parameter sets with linear subspace structure. These reasons justify the constraint that the parameter space $\Theta$ be a \emph{smooth embedded submanifold} in an open subset $\Xi$ of a Euclidean space.
Now, we directly define a \emph{smooth manifold} embedded in $\mathbb{R}^p$ without any detours to the more general theory. See for example \textcite{introToSmoothMani-Lee2012,introToRiemannianMani-Lee2018,optimMatrixMani-AbsilEtAl2007,aufbauAnalysis-kaltenbaeck2021} among others.
\begin{definition}[Manifolds]\label{def:manifold}
A set $\manifold{A}\subseteq\mathbb{R}^p$ is an \emph{embedded smooth manifold} of dimension $d$ if for every $\mat{x}\in\manifold{A}$ there exists a smooth\footnote{Here \emph{smooth} means infinitely differentiable or $C^{\infty}$.} bi-continuous map $\varphi:U\cap\manifold{A}\to V$, called a \emph{chart}, with $\mat{x}\in U\subseteq\mathbb{R}^p$ open and $V\subseteq\mathbb{R}^d$ open.
\end{definition}
We also need the concept of a \emph{tangent space} to formulate asymptotic normality in a way which is independent of a particular coordinate representation. Intuitively, the tangent space at a point $\mat{x}\in\manifold{A}$ of the manifold $\manifold{A}$ is the hyperspace of all velocity vectors $\t{\nabla\gamma(0)}$ of any curve $\gamma:(-1, 1)\to\manifold{A}$ passing through $\mat{x} = \gamma(0)$, see \cref{fig:torus}. Locally, at $\mat{x} = \gamma(0)$ with a chart $\varphi$ we can write $\gamma(t) = \varphi^{-1}(\varphi(\gamma(t)))$ which gives that $\Span\t{\nabla\gamma(0)} \subseteq \Span\t{\nabla\varphi^{-1}(\varphi(\mat{x}))}$. Taking the union over all smooth curves through $\mat{x}$ gives equality. The following definition leverages the simplified setup of smooth manifolds in Euclidean space.
\begin{definition}[Tangent Space]\label{def:tangent-space}
Let $\manifold{A}\subseteq\mathbb{R}^p$ be an embedded smooth manifold and $\mat{x}\in\manifold{A}$. The \emph{tangent space} at $\mat{x}$ of $\manifold{A}$ is defined as
\begin{displaymath}
T_{\mat{x}}\manifold{A} := \Span\t{\nabla\varphi^{-1}(\varphi(\mat{x}))}
\end{displaymath}
for any chart $\varphi$ with $\mat{x}$ in the pre-image of $\varphi$.
\end{definition}
\Cref{def:tangent-space} is consistent since it can be shown that two different charts at the same point have identical span.
\begin{figure}
\centering
\includegraphics[width = 0.5\textwidth]{images/TorustangentSpace.pdf}
\caption{\label{fig:torus}Visualization of the tangent space $T_{\mat{x}}\manifold{A}$ at $\mat{x}$ of the torus $\manifold{A}$. The torus $\manifold{A}$ is a 2-dimensional embedded manifold in $\mathbb{R}^3$. The tangent space $T_{\mat{x}}\manifold{A}\subset\mathbb{R}^3$ is the 2-dimensional hyperplane visualized with its origin $\mat{0}$ shifted to $\mat{x}$. Moreover, two curves $\gamma_1, \gamma_2$ on the torus are drawn with $\mat{x} = \gamma_1(0) = \gamma_2(0)$. The curve velocity vectors $\t{\nabla\gamma_1(0)}$ and $\t{\nabla\gamma_2(0)}$ are drawn as tangent vectors with root $\mat{x}$.}
\end{figure}
As a basis to ensure that the constrained parameter space $\Theta$ is a manifold, which is a requirement of \cref{thm:param-manifold}, we need \cref{thm:kron-manifolds}. Therefore, we need the notion of a \emph{spherical} set, which is a set $\manifold{A}$, on which the Frobenius norm is constant. That is, $\|\,.\,\|_F:\manifold{A}\to\mathbb{R}$ is constant. Furthermore, we call a scale invariant set $\manifold{A}$ a \emph{cone}, that is $\manifold{A} = \{ c \mat{A} : \mat{A}\in\manifold{A} \}$ for all $c > 0$.
\begin{theorem}[Kronecker Product Manifolds]\label{thm:kron-manifolds}
Let $\manifold{A}\subseteq\mathbb{R}^{p_1\times q_1}\backslash\{\mat{0}\}, \manifold{B}\subseteq\mathbb{R}^{p_2\times q_2}\backslash\{\mat{0}\}$ be smooth embedded submanifolds. Assume one of the following conditions holds.
\begin{itemize}
\item[-] ``sphere condition'':
At least one of $\manifold{A}$ or $\manifold{B}$ is \emph{spherical} and let $d = \dim\manifold{A} + \dim\manifold{B}$.
\item[-] ``cone condition'':
Both $\manifold{A}$ and $\manifold{B}$ are \emph{cones} and let $d = \dim\manifold{A} + \dim\manifold{B} - 1$.
\end{itemize}
Then, $\{ \mat{A}\otimes \mat{B} : \mat{A}\in\manifold{A}, \mat{B}\in\manifold{B} \}\subset\mathbb{R}^{p_1 p_2\times q_1 q_2}$ is a smooth embedded $d$-manifold.
\end{theorem}
\begin{theorem}[Parameter Manifold]\label{thm:param-manifold}
Let
\begin{displaymath}
\manifold{K}_{\mat{B}} = \Bigl\{ \bigkron_{k = r}^{1}\mat{\beta}_k : \mat{\beta}_k\in\manifold{B}_k \Bigr\}
\quad\text{and}\quad
\manifold{K}_{\mat{\Omega}} = \Bigl\{ \bigkron_{k = r}^{1}\mat{\Omega}_k : \mat{\Omega}_k\in\manifold{O}_k \Bigr\}
\end{displaymath}
where $\manifold{B}_k\subset\mathbb{R}^{p_k\times q_k}\backslash\{\mat{0}\}$ and $\manifold{O}_k\subset\mathbb{R}^{p_k\times p_k}\backslash\{\mat{0}\}$ are smooth embedded manifolds which are either spheres or cones, for $k = 1, ..., r$. Furthermore, let
\begin{displaymath}
\manifold{CK}_{\mat{\Omega}} = \{ \vech{\mat{\Omega}} : \mat{\Omega}\in\manifold{K}_{\mat{\Omega}} \land \pinv{(\mat{T}_2\pinv{\mat{D}_p})}\mat{T}_2\pinv{\mat{D}_p}\vec{\mat{\Omega}} = \vec{\mat{\Omega}} \}
\end{displaymath}
then the constrained parameter space $\Theta = \mathbb{R}^p \times \manifold{K}_{\mat{B}}\times\manifold{CK}_{\mat{\Omega}}\subset\mathbb{R}^{p(p + 2 q + 3) / 2}$ is a smooth embedded manifold.
\end{theorem}
\subsection{Matrix Manifolds}\label{sec:matrix-manifolds}
A powerful side effect of \cref{thm:param-manifold} is the modeling flexibility it provides. For example, we can perform low rank regression. Or, we may constrain two-way interactions between direct axis neighbors by using band matrices for the $\mat{\Omega}_k$'s, among others.
This flexibility derives from many different matrix manifolds that can be used as building blocks $\manifold{B}_k$ and $\manifold{O}_k$ of the parameter space $\Theta$ in \cref{thm:param-manifold}. A list of possible choices, among others, is given in \cref{tab:matrix-manifolds}. As long as parameters in $\Theta$ are a valid parameterization of a density (or PMF) of \eqref{eq:quadratic-exp-fam} subject to \eqref{eq:eta1-manifold} and \eqref{eq:eta2-manifold}, one may choose any of the manifolds listed in \cref{tab:matrix-manifolds} which are either cones or spherical. We also included an example which is neither a sphere nor a cone. They may also be valid building blocks, but require more work as they are not directly leading to a parameter manifold by \cref{thm:param-manifold}. In case one can show the resulting parameter space $\Theta$ is an embedded manifold, the asymptotic theory of \cref{sec:asymtotics} is applicable.
\begin{table}
\centering
\begin{tabular}{l | l | c c c}
Symbol & Description & C & S & Dimension\\ \hline
$\mathbb{R}^{p\times q}$ & All matrices of dimension $p\times q$ &
\checkmark & \xmark & $p q$ \\ \hline
$\mathbb{R}_{*}^{p\times q}$ & Full rank $p\times q$ matrices &
\checkmark & \xmark & $p q$ \\ \hline
$\Stiefel{p}{q}$ & \emph{Stiefel Manifold}, $\{ \mat{U}\in\mathbb{R}^{p\times q} : \t{\mat{U}}\mat{U} = \mat{I}_q \}$ for $q\leq p$ &
\xmark & \checkmark & $p q - q (q + 1) / 2$ \\ \hline
$\mathcal{S}^{p - 1}$ & Unit sphere in $\mathbb{R}^p$, special case $\Stiefel{p}{1}$ &
\xmark & \checkmark & $p - 1$ \\ \hline
$\UnitaryGrp{p}$ & Unitary Group, special case $\Stiefel{p}{p}$ &
\xmark & \checkmark & $p (p - 1) / 2$ \\ \hline
$\SpecialUnitaryGrp{p}$ & Special Unitary Group $\{ \mat{U}\in U(p) : \det{\mat{U}} = 1 \}$ &
\xmark & \checkmark & $p (p - 1) / 2$ \\ \hline
$\mathbb{R}_{r}^{p\times q}$ & Matrices of known rank $r > 0$, generalizes $\StiefelNonCompact{p}{q}$ &
\checkmark & \xmark & $r(p + q - r)$ \\ \hline
& Symmetric matrices &
\checkmark & \xmark & $p (p + 1) / 2$ \\ \hline
$\SymPosDefMat{p}$ & Symmetric Positive Definite matrices &
\checkmark & \xmark & $p (p + 1) / 2$ \\ \hline
& Scaled Identity $\{ a\mat{I}_p : a\in\mathbb{R}_{+} \}$ &
\checkmark & \xmark & $1$ \\ \hline
& Symmetric $r$-band matrices (includes diagonal) &
\checkmark & \xmark & $(2 p - r) (r + 1) / 2$ \\
& $\qquad\{ \mat{A}\in\mathbb{R}^{p\times p} : \mat{A} = \t{\mat{A}}\land \mat{A}_{i j} = 0\ \forall |i - j| > r \}$ \\ \hline
& Auto correlation alike $\{ \mat{A}\in\mathbb{R}^{p\times p} : \mat{A}_{i j} = \rho^{|i - j|}, \rho\in(0, 1) \}$ &
\xmark & \xmark & $1$ \\ \hline
\end{tabular}
\caption{\label{tab:matrix-manifolds}Examples of embedded matrix manifolds. ``Symbol'' a (more or less) common notation for the matrix manifold, if at all. ``C'' stands for \emph{cone}, meaning it is scale invariant. ``S'' means \emph{spherical}, that is, constant Frobenius norm.}
\end{table}
\begin{remark}
The \emph{Grassmann Manifold} of $q$ dimensional subspaces in $\mathbb{R}^p$ is not listed in \cref{tab:matrix-manifolds} since it is not embedded in $\mathbb{R}^{p \times q}$.
\end{remark}
\subsection{Asymptotics}\label{sec:asymtotics}
Let $Z$ be a random variable distributed according to a parameterized probability distribution with density $f_{\mat{\theta}_0}\in\{ f_{\mat{\theta}} : \mat{\theta}\in\Theta \}$ where $\Theta$ is a subset of a Euclidean space. We want to estimate the parameter $\mat{\theta}_0$ using $n$ i.i.d. (independent and identically distributed) copies of $Z$. We assume a known, real-valued and measurable function $z\mapsto m_{\mat{\theta}}(z)$ for every $\mat{\theta}\in\Theta$ and that $\mat{\theta}_0$ is the unique maximizer of the map $\mat{\theta}\mapsto M(\mat{\theta}) = \E m_{\mat{\theta}}(Z)$. For the estimation we maximize the empirical version
\begin{align}\label{eq:Mn}
M_n(\mat{\theta}) &= \frac{1}{n}\sum_{i = 1}^n m_{\mat{\theta}}(Z_i).
\end{align}
An \emph{M-estimator} $\hat{\mat{\theta}}_n = \hat{\mat{\theta}}_n(Z_1, ..., Z_n)$ is a maximizer for the objective function $M_n$ over the parameter space $\Theta$ defined as
\begin{displaymath}
\hat{\mat{\theta}}_n = \argmax_{\mat{\theta}\in\Theta} M_n(\mat{\theta}).
\end{displaymath}
It is not necessary to have a perfect maximizer; as long as the objective has a finite supremum, it is sufficient to take an \emph{almost maximizer} $\hat{\mat{\theta}}_n$ as defined in the following.
\begin{definition}[weak and strong M-estimators]
An estimator $\hat{\mat{\theta}}_n$ for the objective function $M_n$ in \eqref{eq:Mn} with $\sup_{\mat{\theta}\in\Theta}M_n(\mat{\theta}) < \infty$ such that
\begin{displaymath}
M_n(\hat{\mat{\theta}}_n) \geq \sup_{\mat{\theta}\in\Theta}M_n(\mat{\theta}) - o_P(n^{-1})
\end{displaymath}
is called a \emph{strong M-estimator} over $\Theta$. Replacing $o_P(n^{-1})$ by $o_P(1)$ gives a \emph{weak M-estimator}.
\end{definition}
\begin{theorem}[Asymptotic Normality]\label{thm:asymptotic-normality-gmlm}
Assume $Z = (\ten{X}, Y)$ satisfies model \eqref{eq:quadratic-exp-fam} subject to \eqref{eq:eta1-manifold} and \eqref{eq:eta2-manifold} with true constrained parameter $\mat{\theta}_0 = (\overline{\eta}_0, \mat{B}_0, \mat{\Omega}_0)\in\Theta$, where $\Theta$ is defined in \cref{thm:param-manifold}. Under the regularity \crefrange{cond:differentiable-and-convex}{cond:finite-sup-on-compacta} in the appendix, there exists a strong M-estimator sequence $\hat{\mat{\theta}}_n$ deriving from $l_n$ in \eqref{eq:log-likelihood} over $\Theta$. Furthermore, any strong M-estimator $\hat{\mat{\theta}}_n$ converges in probability to the true parameter $\mat{\theta}_0$ over $\Theta$. That is, $ \hat{\mat{\theta}}_n\xrightarrow{p}\mat{\theta}_0$. Moreover, every strong M-estimator $\hat{\mat{\theta}}_n$ is asymptotically normal,
\begin{displaymath}
\sqrt{n}(\hat{\mat{\theta}}_n - \mat{\theta}_0) \xrightarrow{d} \mathcal{N}_{p(p + 2 q + 3) / 2}(0, \mat{\Sigma}_{\mat{\theta}_0})
\end{displaymath}
with asymptotic variance-covariance structure $\mat{\Sigma}_{\mat{\theta}_0}$ given in \eqref{eq:asymptotic-covariance-gmlm}.
\end{theorem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Asymptotic Normality}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The following is a reformulation of \textcite[Lemma~2.3]{asymptoticMLE-BuraEtAl2018} which assumes Condition~2.2 to hold. The existence of a mapping in Condition~2.2 is not needed for Lemma~2.3. It suffices that the restricted parameter space $\Theta$ is a subset of the unrestricted parameter space $\Xi$, which is trivially satisfied in our setting. Under this, \cref{thm:exists-strong-M-estimator-on-subsets} follows directly from \textcite[Lemma~2.3]{asymptoticMLE-BuraEtAl2018}.
\begin{theorem}[Existence of strong M-estimators on Subsets]\label{thm:exists-strong-M-estimator-on-subsets}
Assume there exists a (weak/strong) M-estimator $\hat{\mat{\xi}}_n$ for $M_n$ over $\Xi$, then there exists a strong M-estimator $\hat{\mat{\theta}}_n$ for $M_n$ over any non-empty $\Theta\subseteq\Xi$.
\end{theorem}
\begin{theorem}[Existence and Consistency of M-estimators on Subsets]\label{thm:M-estimator-consistency-on-subsets}
Let $\Xi$ be a convex open subset of a Euclidean space and $\Theta\subseteq\Xi$ non-empty. Assume $\mat{\xi}\mapsto m_{\mat{\xi}}(z)$ is a strictly concave function on $\Xi$ for almost all $z$ and $z\mapsto m_{\mat{\xi}}(z)$ is measurable for all $\mat{\xi}\in\Xi$. Let $M(\mat{\xi}) = \E m_{\mat{\xi}}(Z)$ be a well defined function with a unique maximizer $\mat{\theta}_0\in\Theta\subseteq\Xi$; that is, $M(\mat{\theta}_0) > M(\mat{\xi})$ for all $\mat{\xi}\neq\mat{\theta}_0$. Also, assume
\begin{displaymath}
\E\sup_{\mat{\xi}\in K}|m_{\mat{\xi}}(Z)| < \infty,
\end{displaymath}
for every non-empty compact $K\subset\Xi$. Then, there exists a strong M-estimator $\hat{\mat{\theta}}_n$ of $M_n(\mat{\theta}) = \frac{1}{n}\sum_{i = 1}^{n} m_{\mat{\theta}}(Z_i)$ over the subset $\Theta$. Moreover, any strong M-estimator $\hat{\mat{\theta}}_n$ of $M_n$ over $\Theta$ converges in probability to $\mat{\theta}_0$, that is $\hat{\mat{\theta}}_n\xrightarrow{p}\mat{\theta}_0$.
\end{theorem}
\todo{The assumptions of the following can be a bit weakened, is this neccessary? For example the Hessian can be singular but is non-singular constrained to the tangent space. We can also only define $\mat{\theta}\mapsto m_{\mat{\theta}}$ only on the manifold which makes the statement much more technical, but way more general while we need to ensure that every smooth local extension of $\mat{\theta}\mapsto m_{\mat{\theta}}$ yields the same statement, which it does, but well, then it gets more complicated! Maybe add these things as a remark? The even more general formulation for Riemannian Manifolds is definitely over the top!}
\begin{theorem}[Asymptotic Normality for M-estimators on Manifolds]\label{thm:M-estimator-asym-normal-on-manifolds}
Let $\Theta\subseteq\mathbb{R}^p$ be a smooth embedded manifold. For each $\mat{\theta}$ in a neighborhood in $\mathbb{R}^p$ of the true parameter $\mat{\theta}_0\in\Theta$ let $z\mapsto m_{\mat{\theta}}(z)$ be measurable and $\mat{\theta}\mapsto m_{\mat{\theta}}(z)$ be differentiable at $\mat{\theta}_0$ for almost all $z$. Assume also that there exists a measurable function $u$ with $\E[u(Z)^2] < \infty$ such that, for almost all $z$ and all $\mat{\theta}_1, \mat{\theta}_2$ in a neighborhood of $\mat{\theta}_0$,
\begin{displaymath}
| m_{\mat{\theta}_1}\!(z) - m_{\mat{\theta}_2}\!(z) | \leq u(z) \| \mat{\theta}_1 - \mat{\theta}_2 \|_2.
\end{displaymath}
Moreover, assume that $\mat{\theta}\mapsto\E[m_{\mat{\theta}}(Z)]$ admits a second-order Taylor expansion at $\mat{\theta}_0$ in a neighborhood of $\mat{\theta}_0$ in $\mathbb{R}^p$ with a non-singular Hessian $\mat{H}_{\mat{\theta}_0} = \nabla^2_{\mat{\theta}}\E[m_{\mat{\theta}}(Z)]|_{\mat{\theta} = \mat{\theta}_0}\in\mathbb{R}^{p\times p}$.
If $\hat{\mat{\theta}}_n$ is a strong M-estimator of $\mat{\theta}_0$ in $\Theta$, then $\hat{\mat{\theta}}_n$ is asymptotically normal
\begin{displaymath}
\sqrt{n}(\hat{\mat{\theta}}_n - \mat{\theta}_0) \xrightarrow{d} \mathcal{N}_p(\mat{0}, \mat{\Pi}_{\mat{\theta}_0} \E[\nabla_{\mat{\theta}} m_{\mat{\theta}_0}(Z)\t{(\nabla_{\mat{\theta}} m_{\mat{\theta}_0}(Z))}]\mat{\Pi}_{\mat{\theta}_0})
\end{displaymath}
where $\mat{\Pi}_{\mat{\theta}_0} = \mat{P}_{\mat{\theta}_0}\pinv{(\t{\mat{P}_{\mat{\theta}_0}}\mat{H}_{\mat{\theta}_0}\mat{P}_{\mat{\theta}_0})}\t{\mat{P}_{\mat{\theta}_0}}$ and $\mat{P}_{\mat{\theta}_0}$ is any matrix whose span is the tangent space $T_{\mat{\theta}_0}\Theta$ of $\Theta$ at $\mat{\theta}_0$.
\end{theorem}
\begin{remark}
\cref{thm:M-estimator-asym-normal-on-manifolds} has as special case Theorem~5.23 in \textcite{asymStats-van_der_Vaart1998}, when $\Theta$ is an open subset of a Euclidean space as opposed to a smooth embedded manifold.
\todo{I don't like it that much, mention that an open set if an embedded manifold implying that $\mat{\Pi}_{\mat{\theta}_0} = \mat{H}_{\mat{\theta}_0}^{-1}$}
\end{remark}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Simulations}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
In this section we provide simulation results for the tensor normal as well as the Ising model where different aspects of the GMLM model are compared against other methods. The comparison methods are tensor Sliced Inverse Regression (TSIR) \parencite{tsir-DingCook2015}, MGCCA \parencite{MGCCA-GirkaEtAl2024} and the Tucker decomposition that is a higher-order form of principal component analysis (HOPCA) \parencite{KoldaBader2009}, for both continuous and binary data. For the latter, the binary values are simply treated as continuous. As a base line we also include classic PCA on vectorized observations. \todo{check, fix, ...}
All experiments are performed with different sample sizes $n = 100, 200, 300, 500$ and $750$. Every experiment is repeated $100$ times.
We are interested in the quality of the estimate of the true sufficient reduction $\ten{R}(\ten{X})$ from \cref{thm:sdr}. Therefore, we compare with the true vectorized reduction matrix $\mat{B} = \bigkron_{k = r}^{1}\mat{\beta}_k$, as it is compatible with any linear reduction method. The distance $d(\mat{B}, \hat{\mat{B}})$ between $\mat{B}\in\mathbb{R}^{p\times q}$ and an estimate $\hat{\mat{B}}\in\mathbb{R}^{p\times \tilde{q}}$ is the \emph{subspace distance} which is proportional to
\begin{displaymath}
d(\mat{B}, \hat{\mat{B}}) \propto \| \mat{B}\pinv{(\t{\mat{B}}\mat{B})}\t{\mat{B}} - \hat{\mat{B}}\pinv{(\t{\hat{\mat{B}}}\hat{\mat{B}})}\t{\hat{\mat{B}}} \|_F,
\end{displaymath}
the Frobenius norm of the difference between the projections onto the span of $\mat{B}$ and $\hat{\mat{B}}$. The proportionality constant\footnote{Depends on row dimension $p$ and the ranks of $\mat{B}$ and $\hat{\mat{B}}$ given by $(\min(\rank\mat{B} + \rank\hat{\mat{B}}, 2 p - (\rank\mat{B} + \rank\hat{\mat{B}})))^{-1/2}$.} of $d(\mat{B}, \hat{\mat{B}})$ ensures that the subspace distance is in the interval $[0, 1]$. A distance of zero implies space overlap, a distance of one means that the subspaces are orthogonal.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Tensor Normal}\label{sec:sim-tensor-normal}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
For every tensor normal model we draw i.i.d. samples $\ten{X}_i$ for $i = 1, ..., n$ from the conditional distribution of $\ten{X}\mid Y = y_i$ where $y_i$ is an i.i.d. sample from the standard normal distribution. The conditional distribution $\ten{X}\mid Y = y_i$ depends on the choice of the GMLM parameters $\overline{\ten{\eta}}$, $\mat{\beta}_1, ..., \mat{\beta}_r$, $\mat{\Omega}_1, ..., \mat{\Omega}_r$, and the function $\ten{F}_y$ of $y$. In all experiments we set $\overline{\ten{\eta}} = \mat{0}$. The other parameters and $\ten{F}_y$ are described per experiment. With the true GMLM parameters and $\ten{F}_y$ given, we compute the conditional tensor normal mean $\ten{\mu}_y = \ten{F}_y\mlm_{k = 1}^{r}\mat{\Omega}_k^{-1}\mat{\beta}_k$ and covariances $\mat{\Sigma}_k = \mat{\Omega}_k^{-1}$ as in \eqref{eq:tnormal_cond_params}.
We start with a $1$ dimensional linear dependence on $y$ in 1a). Then, the dependence on $y$ is via a cubic polynomial 1b-d). In 1b) the reduction is full rank, in contrast to 1c) where the $\mat{\beta}_k$'s are of rank $1$, in other words, low rank regression. In 1d) we constrain the inverse covariances $\mat{\Omega}_k$ to be tri-diagonal. Both 1c) and 1d) are examples of building the parameter space according to \cref{thm:param-manifold}. The final tensor normal experiment 1e) is a model misspecification. The true model does \emph{not} have a Kronecker structure and the ``known'' function $\ten{F}_y$ of $y$ is misspecified as well.
\begin{itemize}
\item[1a)] The predictors $\ten{X}$ are $2\times 3\times 5$ dimensional, that is $r = 3$. The dependence through the inverse regression model is linear; specifically, $\ten{F}_y\equiv y$ is a $1\times 1\times 1$ tensor. The true $\mat{\beta}_k$'s are all equal to $\mat{e}_1\in\mathbb{R}^{p_k}$, the first unit vector, for $k \in \{1, 2, 3\}$. The matrices $\mat{\Omega}_k = \mathrm{AR}(0.5)$ follow an auto-regression like structure. That is, the elements are given by $(\mat{\Omega}_k)_{i j} = 0.5^{|i - j|}$.
\item[1b)] The predictors $\ten{X}$ are again $3$ dimensional with dimension $2\times 3\times 5$ and relate to the response $y$ via a cubic polynomial. This is modeled via $\ten{F}_y$ of dimension $2\times 2\times 2$ by the twice iterated outer product of the vector $(1, y)$. Element wise this reads $(\ten{F}_y)_{i j k} = y^{i + j + k - 3}$. All $\mat{\beta}_k$'s are set to $(\mat{e}_1, \mat{e}_2)\in\mathbb{R}^{p_k\times 2}$ with $\mat{e}_i$ the $i$'th unit vector and the $\mat{\Omega}_k$'s are $\mathrm{AR}(0.5)$.
\item[1c)] Similar to 1b), except that the GMLM parameters $\mat{\beta}_k$ are rank $1$ given by
\begin{displaymath}
\mat{\beta}_1 = \begin{pmatrix} 1 & -1 \\ -1 & 1 \end{pmatrix},\quad
\mat{\beta}_2 = \begin{pmatrix} 1 & -1 \\ -1 & 1 \\ 1 & -1 \end{pmatrix},\quad
\mat{\beta}_3 = \begin{pmatrix} 1 & -1 \\ -1 & 1 \\ 1 & -1 \\ -1 & 1 \\ 1 & -1 \end{pmatrix}.
\end{displaymath}
\item[1d)] Again like 1b). This time the true $\mat{\Omega}_k$'s, for $k = 1, 2, 3$, are tri-diagonal. Their elements are given by $(\mat{\Omega}_k)_{i j} = \delta_{0, |i - j|} + 0.5\delta_{1, |i - j|}$ with $\delta_{i, j}$ being the Kronecker delta.
\item[1e)] For the misspecification model we let $\ten{X}\mid Y$ be multivariate normal but \emph{not} tensor normal. Let $\ten{X}$ be $5\times 5$ dimensional, $Y$ is univariate standard normal and $\mat{f}_y$ is a $4$ dimensional vector given by $\mat{f}_y = (1, \sin(y), \cos(y), \sin(y)\cos(y))$. The true vectorized reduction matrix $\mat{B}$ is $25\times 4$ consisting of the first $4$ columns of the identity. The variance-covariance matrix $\mat{\Sigma}$ is an auto-regression like structure with correlation coefficient $0.5$. Element wise $\mat{B}_{i j} = \delta_{i j}$ and $\mat{\Sigma}_{i j} = 0.5^{|i - j|}$. Both, $\mat{B}$ and $\mat{\Omega} = \mat{\Sigma}^{-1}$ violate the Kronecker product assumptions \eqref{eq:eta1} and \eqref{eq:eta2} of the GMLM model. Then, we set
\begin{displaymath}
\vec{\ten{X}}\mid (Y = y) = \mat{B}\mat{f}_y + \mathcal{N}_{25}(\mat{0}, \mat{\Sigma}).
\end{displaymath}
Furthermore, we fit the model with the wrong ``known'' function $\ten{F}_y$. We set $\ten{F}_y$ to be a $2\times 2$ matrix with a quadratic linkage via elements given by $(\ten{F}_y)_{i j} = y^{|i - j|}$.
\end{itemize}
\todo{How to describe that? I mean, sure, but what to write?}
The results of 1c) are surprising. The GMLM model behaves as expected, clearly being the best. The first surprise is that PCA, HOPCA and MGCCA are visually indistinguishable. This is explained by a high signal to noise ratio in this particular example. But the biggest surprise is the failure of TSIR. It turns out that TSIR is usually well equipped to handle those specific low rank problems (given the true rank of the problem which is the case for all methods in every simulation), but by pure coincidence we picked a case where TSIR fails. Intending to pinpoint the specific problem we made another simulation where we change the tensor order $r$ from $2$ to $4$. Furthermore, we altered the coefficient $\rho$ for the auto regression type matrices $(\mat{\Omega}_k)_{i j} = \rho^{|i - j|}$. We let $\ten{F}_y$ be the $r$ times iterated outer product of the vector $(1, y)$. In the case of $r = 3$ this gives the same $\ten{F}_y$ as in 1c). Then, we set up two scenarios where the definitions of the true $\mat{\beta}_k$'s of rank $1$, for $k = 1, \ldots, r$, are different. The rest is identical to simulation 1c).
\begin{itemize}
\item[V1)] The first version sets all $\mat{\beta}_k$'s identical to
\begin{displaymath}
\mat{\beta}_k = \begin{pmatrix} 1 & 1 \\ 0 & 0 \end{pmatrix}
\end{displaymath}
which gives the true vectorized reduction matrix $\mat{B} = \bigkron_{k = r}^{1}\mat{\beta}_k$ equal to a $2^r\times 2^r$ rank $1$ matrix with the first row all ones and the rest filled with zeros. The minimal true reduction is the $2^r$ dimensional first unit vector $\mat{e}_1$. In this setting the vectorized expected value is given by $\E[\vec{\ten{X}} \mid Y = y] = (1 + y)^r \bigkron_{k = r}^{1}(\mat{\Omega}_k^{-1}\mat{e}_1)$, where here $\mat{e}_1 = \t{(1, 0)}\in\mathbb{R}^2$. With
\begin{displaymath}
\mat{\Omega}_k = \begin{pmatrix}
1 & \rho \\ \rho & 1
\end{pmatrix}
\end{displaymath}
\begin{displaymath}
\mat{\Sigma}_k = \mat{\Omega}_k^{-1} = \frac{1}{1 - \rho^2}\begin{pmatrix}
1 & -\rho \\ -\rho & 1
\end{pmatrix}
\end{displaymath}
\begin{displaymath}
\E[\ten{X} \mid Y = y] = \frac{(1 + y)^r}{(1 - \rho^2)^r}\bigouter_{k = 1}^{r}\begin{pmatrix}
1 \\ -\rho
\end{pmatrix}
\end{displaymath}
\begin{displaymath}
\E[\vec{\ten{X}} \mid Y = y] = \frac{(1 + y)^r}{(1 - \rho^2)^r}\bigkron_{k = r}^{1}\begin{pmatrix}
1 \\ -\rho
\end{pmatrix}
\end{displaymath}
In this setting only the first component of $\ten{X}$, that is $(\vec{\ten{X}})_1$, depends directly on $Y$ via $\E[(\vec{\ten{X}})_1\mid Y = y] = (1 + y)^r$. All other components contain information about $Y$ through the correlation structure only. \todo{check this!}
\item[V2)] Similar to the $\mat{\beta}_k$'s in 1c), we set all $\mat{\beta}_k$'s identical to
\begin{displaymath}
\mat{\beta}_k = \begin{pmatrix} 1 & -1 \\ -1 & 1 \end{pmatrix}
\end{displaymath}
\begin{displaymath}
\E[\vec{\ten{X}}\mid Y = y] = \frac{(1 - y)^r}{(1 - \rho)^r}\bigkron_{k = r}^{1}\begin{pmatrix}
1 \\ -1
\end{pmatrix}
\end{displaymath}
\end{itemize}
% simplified the simulation such that $p_k = q_k = 2$ for $k = 1, \ldots, r$. We let the functions $\ten{F}_y = \bigcirc_{k = 1}^{r}(1, y)$
% Then, we simulate with $100$ replications per case where every case
% The setup which is ídentical in both cases is as follows. The response $Y$ is i.i.d. standard normal and the response tensor $\ten{F}_y$ consists of monomials with max order $r$. Its elements are equal to $(\ten{F}_y)_{\mat{i}} = y^{|\mat{i}| - r}$ where $\mat{i}$ is a multi-index of length $r$ and $|mat{i}|$ is the sum of th elements of $\mat{i}$.
\begin{figure}[hp!]
\centering
\includegraphics[width = \textwidth]{plots/sim-normal.pdf}
\caption{\label{fig:sim-normal}Visualization of the simulation results for the tensor normal GMLM. Sample size on the $x$-axis and the mean of subspace distance $d(\mat{B}, \hat{\mat{B}})$ over $100$ replications on the $y$-axis. Described in \cref{sec:sim-tensor-normal}.}
\end{figure}
\begin{figure}[ht!]
\centering
\includegraphics[width = \textwidth]{plots/sim-tsir.pdf}
\caption{\label{fig:sim-tsir}Simulation to investigate the unexpected failure of TSIR in simulation 1c).}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Ising Model}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}[ht!]
\centering
\includegraphics[width = \textwidth]{plots/sim-ising.pdf}
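\caption{\label{fig:sim-ising}Visualization of the simulation results for the Ising GMLM. Sample size on the $x$-axis and the mean of subspace distance $d(\mat{B}, \hat{\mat{B}})$ over $100$ replications on the $y$-axis.}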
\end{figure}
% \begin{table}
% \begin{tabular}{c | ccc ccc c}
% $n$ & GMLM & PCA & HOPCA & LPCA & CLPCA & TSIR & MGCCA \\
% \hline
% 100 & {\bf 0.34} (0.14) & 0.90 (0.04) & 0.90 (0.05) & 0.94 (0.09) & 0.91 (0.03) & 0.48 (0.19) & 0.55 (0.13) \\
% 200 & {\bf 0.25} (0.11) & 0.90 (0.03) & 0.90 (0.03) & 0.96 (0.07) & 0.91 (0.02) & 0.38 (0.16) & 0.53 (0.10) \\
% 300 & {\bf 0.20} (0.09) & 0.89 (0.02) & 0.89 (0.02) & 0.97 (0.06) & 0.91 (0.02) & 0.29 (0.13) & 0.51 (0.11) \\
% 500 & {\bf 0.16} (0.07) & 0.90 (0.02) & 0.90 (0.02) & 0.98 (0.01) & 0.91 (0.01) & 0.23 (0.10) & 0.50 (0.08) \\
% 750 & {\bf 0.13} (0.05) & 0.90 (0.01) & 0.90 (0.01) & 0.98 (0.02) & 0.91 (0.01) & 0.23 (0.08) & 0.53 (0.06)
% \end{tabular}
% \caption{\label{tab:sim-ising}xyz uvw}
% \end{table}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Data Analysis}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
In this section we perform two \todo{really two!} applications of the GMLM model on real data. The first example is the tensor normal model applied to EEG data\todo{do this!}. Next, we perform a proof of concept data analysis example for chess. The main purpose of choosing chess is twofold. First, we can illustrate an explicit use case for the (till now ignored) linear constraint matrix $\mat{T}_2$. Second, it is a personal interest of one of the authors.\todo{???}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{EEG}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Chess}\label{sec:chess}
The data set is provided by the \citetitle{lichess-database}\footnote{\fullcite{lichess-database}}. We downloaded the data of November 2023, consisting of more than $92$ million games. We removed all games without position evaluations. The evaluations, also denoted scores, are from Stockfish,\footnote{\fullcite{stockfish}} a free and strong chess engine. The scores take the role of the response $Y$ and correspond to a winning probability from white's point of view. Positive scores are good for white and negative scores indicate an advantage for black. We ignore all highly unbalanced positions, which we set to be positions with absolute score above $5$. We also remove all positions with a mate score (one side can force checkmate). Furthermore, we only consider positions after $10$ half-moves to avoid oversampling the beginning of the most common openings including the start position which is in every game. Finally, we only consider positions with white to move. This leads to a final data set of roughly $64$ million positions, including duplicates.
A chess position is encoded as a set of $12$ binary matrices $\ten{X}_{\mathrm{piece}}$ of dimensions $8\times 8$. Every binary matrix encodes the positioning of a particular piece by containing a $1$ if the piece is present at the corresponding board position. The $12$ pieces derive from the $6$ types of pieces, namely pawns (\pawn), knights (\knight), bishops (\bishop), rooks (\rook), queens (\queen) and kings (\king), of two colors, black and white. See \cref{fig:fen2tensor} for a visualization.
\begin{figure}[hp!]
\centering
\includegraphics[width = \textwidth]{images/fen2tensor.pdf}
\caption{\label{fig:fen2tensor}The chess start position and its 3D binary tensor representation, empty entries are $0$.}
\end{figure}
We assume that $\ten{X}_{\mathrm{piece}}\mid Y = y$ follows an Ising GMLM model (\cref{sec:ising_estimation}) with different conditional piece predictors being independent. The independence assumption is for the sake of simplicity even though this is clearly not the case in the underlying true distribution. See \cref{sec:chess-discussion} for a short discussion of improvements. By this simplifying assumption we get a mixture model with the log-likelihood
\begin{displaymath}
l_n(\mat{\theta}) = \frac{1}{12}\sum_{\mathrm{piece}}l_n(\mat{\theta}_{\mathrm{piece}})
\end{displaymath}
where $l_n(\mat{\theta}_{\mathrm{piece}})$ is the Ising GMLM log-likelihood as in \cref{sec:ising_estimation} for $\ten{X}_{\mathrm{piece}}\mid Y = y$. For every component the same relation to the scores $y$ is modeled via a $2\times 2$ dimensional matrix valued function $\ten{F}_y$ consisting of the monomials $1, y, y^2$, specifically $(\ten{F}_y)_{i j} = y^{i + j - 2}$.
Due to the sheer scale of the data, millions of observations, it is computationally infeasible to compute the gradients on the entire data set. Simply using a computationally manageable subset is not an option either, due to the high dimension of the binary data, which is $12$ times an $8\times 8$ matrix for every observation, giving a total dimension of $768$. The main issue is that a manageable subset, say one million observations, still leads to a degenerate data set. In our simplified mixture model, the pawns are a specific issue as there are multiple millions of different combinations of the $8$ pawns per color on the $6\times 8$ sub grid the pawns can be positioned on. This alone does not allow taking a reasonably sized subset for estimation. The solution is to switch from a classic gradient based optimization to a stochastic version. This means that every gradient update uses a new random subset of the entire data set. Therefore, we draw independent random samples from the data consisting of $64$ million positions. The independence of samples derives from the independence of games, and every sample is drawn from a different game.
\paragraph{Validation:}
Given the non-linear nature of the reduction, due to the quadratic matrix valued function $\ten{F}_y$ of the score $y$, we use a \emph{generalized additive model}\footnote{using the function \texttt{gam()} from the \texttt{R} package \texttt{mgcv}.} (GAM) to predict position scores from reduced positions. The reduced positions are $48$ dimensional continuous values by combining the $12$ mixture components from the $2\times 2$ matrix valued reductions per piece. The per piece reduction is
\begin{displaymath}
\ten{R}(\ten{X}_{\mathrm{piece}}) = \t{\mat{\beta}_{1,\mathrm{piece}}}(\ten{X}_{\mathrm{piece}} - \E\ten{X}_{\mathrm{piece}})\mat{\beta}_{2, \mathrm{piece}}
\end{displaymath}
which gives the complete $48$ dimensional vectorized reduction by stacking the piece wise reductions
\begin{displaymath}
\vec{\ten{R}(\ten{X})}
= (\vec{\ten{R}(\ten{X}_{\text{white pawn}})}, \ldots, \vec{\ten{R}(\ten{X}_{\text{black king}})})
= \t{\mat{B}}\vec(\ten{X} - \E\ten{X}).
\end{displaymath}
The last equality encodes all the piece wise reductions in a block diagonal full reduction matrix $\mat{B}$ of dimension $768\times 48$ which is applied to the vectorized 3D tensor $\ten{X}$ combining all the piece components $\ten{X}_{\mathrm{piece}}$ into a single tensor of dimension $8\times 8\times 12$. This is a reduction to $6.25\%$ of the original dimension. The $R^2$ statistic of the GAM fitted on $10^5$ new reduced samples is $R^2_{gam}\approx 42\%$. A linear model on the reduced data achieves $R^2_{lm}\approx 25\%$ which clearly shows the non-linear relation. On the other hand, the static evaluation of the \emph{Schach H\"ornchen}\todo{ref/mention/?!?} engine, given the full position (\emph{not} reduced), achieves an $R^2_{hce}\approx 51\%$. The $42\%$ compare reasonably well to the $51\%$ of the engine static evaluation, which gets the original position and uses chess specific expert knowledge. Features the static evaluation includes, which are expected to be learned by the GMLM mixture model, are \emph{material} (piece values) and \emph{piece square tables} (PSQT, preferred piece type positions). In addition, the static evaluation includes chess specific features like \emph{king safety}, \emph{pawn structure} or \emph{rooks on open files}. This lets us conclude that the reduction captures most of the relevant features possible, given the oversimplified modeling we performed.
\paragraph{Interpretation:} For a compact interpretation of the estimated reduction we construct PSQTs. To do so we use the linear model from the validation section. Then, we rewrite the combined linear reduction and linear model in terms of PSQTs. Let $\mat{B}$ be the $768\times 48$ full vectorized linear reduction. This is the block diagonal matrix with the $64\times 4$ dimensional per piece reductions $\mat{B}_{\mathrm{piece}} = \mat{\beta}^{\mathrm{piece}}_2\otimes\mat{\beta}^{\mathrm{piece}}_1$. Then, the linear model with coefficients $\mat{b}$ and intercept $a$ on the reduced data is given by
\begin{equation}\label{eq:chess-lm}
y = a + \t{\mat{b}}\t{\mat{B}}\vec(\ten{X} - \E\ten{X}) + \epsilon
\end{equation}
with an unknown mean zero error term $\epsilon$ and treating the binary tensor $\ten{X}$ as continuous. Decomposing the linear model coefficients into blocks of $4$ gives per piece coefficients $\mat{b}_{\mathrm{piece}}$ which combine with the diagonal blocks $\mat{B}_{\mathrm{piece}}$ of $\mat{B}$ only. Rewriting \eqref{eq:chess-lm} gives
\begin{align*}
y
&= a + \sum_{\mathrm{piece}}\t{(\mat{B}_{\mathrm{piece}}\mat{b}_{\mathrm{piece}})}\vec(\ten{X}_{\mathrm{piece}} - \E\ten{X}_{\mathrm{piece}}) + \epsilon \\
&= \tilde{a} + \sum_{\mathrm{piece}}\langle
\mat{B}_{\mathrm{piece}}\mat{b}_{\mathrm{piece}},
\vec(\ten{X}_{\mathrm{piece}})
\rangle + \epsilon
\end{align*}
with a new intercept term $\tilde{a}$, which is of no interest to us. Finally, we enforce a color symmetry, using a known mechanism from chess engines. Specifically, mirroring the position changes the sign of the score $y$. Here, mirroring reverses the rank (row) order, this is the image in a mirror behind a chess board. For every piece, let $\mat{C}_{\mathrm{piece}}$ be an $8\times 8$ matrix with elements $(\mat{C}_{\mathrm{piece}})_{i j} = (\mat{B}_{\mathrm{piece}}\mat{b}_{\mathrm{piece}})_{i + 8 (j - 1)}$, and denote with $\mat{M}(\mat{A})$ the matrix mirror operation which reverses the row order of a matrix $\mat{A}$. Using this new notation allows enforcing this symmetry, leading to the new approximate linear relation
\begin{align*}
y &= \tilde{a} + \sum_{\mathrm{piece}}\langle
\mat{C}_{\mathrm{piece}},
\ten{X}_{\mathrm{piece}}
\rangle + \epsilon \\
&\approx \tilde{a} + \sum_{\text{piece type}}\frac{1}{2}\langle
\mat{C}_{\text{white piece}} - \mat{M}(\mat{C}_{\text{black piece}}),
\ten{X}_{\text{white piece}} - \mat{M}(\ten{X}_{\text{black piece}})
\rangle + \epsilon
\end{align*}
If for every piece type ($6$ types, \emph{not} distinguishing between color) $\mat{C}_{\text{white piece}} = -\mat{M}(\mat{C}_{\text{black piece}})$ holds, then we have equality. In our case this is valid given that the estimates $\hat{\mat{C}}_{\mathrm{piece}}$ fulfill this property with a small error. The $6$ matrices $(\mat{C}_{\text{white piece}} - \mat{M}(\mat{C}_{\text{black piece}})) / 2$ are called \emph{piece square tables} (PSQT) which are visualized in \cref{fig:psqt}. The interpretation of those tables is straightforward. A high positive value (blue) means that it is usually good to have a piece of the corresponding type on that square while a high negative value (red) means the opposite. It needs to be considered that the PSQTs are for quiet positions only, that means all pieces are safe in the sense that there are no legal capturing moves nor is the king in check.
\begin{figure}[hp!]
\centering
\includegraphics[width = \textwidth]{plots/psqt.pdf}
\caption{\label{fig:psqt}Extracted PSQTs (piece square tables) from the chess example GMLM reduction.}
\end{figure}
The first visual effect in \cref{fig:psqt} is the dark blue PSQT of the Queen followed by a not so dark Rook PSQT. This indicates that the Queen, followed by the Rook, are the most valuable pieces (after the king, but a king piece value makes no sense). The next two are the Knight and Bishop which have higher value than the Pawns, ignoring the $6$th and $7$th rank as this makes the pawns a potential queen. This is the classic piece value order known in chess.
Next, going one by one through the PSQTs, a few words about the preferred positions for every piece type. The pawn positions are specifically good on the $6$th and especially on the $7$th rank as this threatens a promotion to a Queen (or Knight, Bishop, Rook). The Knight PSQT is a bit surprising, the most likely explanation for the knight being good in the enemy territory is that it got there by capturing an enemy piece for free. This is a common occurrence in low rated games, which make up a big chunk of the training data, ranging over all levels. The Bishops seem to have no specific preferred placement, only a slightly higher overall value than pawns, excluding pawns about to promote. Continuing with the rooks, we see that the rook is a good attacking piece, indicated by a safe rook infiltration. The Queen is powerful almost everywhere, only the outer back rank squares (lower left and right) tend to reduce her value. This is rooted in the fact that the queen being there is a sign of being pushed by enemy pieces, leading to a lot of squares being controlled by the enemy, hindering one's own movement. Finally, the king: given that the goal of the game is to checkmate the king, a safe position for the king is very valuable. This is seen by the back rank (rank $1$) being the only non-penalized squares. Furthermore, the safest squares are the castling target squares ($g1$ and $c1$) as well as the $b1$ square. Shifting the king over to $b1$ is quite common, protecting the $a2$ pawn and providing a completely protected pawn shield in front of the king.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\appendix
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Tensor Calculus and Multi Linear Algebra}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{displaymath}
(\ten{A}\circ\ten{B})\mlm_{k = 1}^{r + s} \mat{C}_k
=
\Bigl(\ten{A}\mlm_{k = 1}^r \mat{C}_k\Bigr)\circ\Bigl(\ten{B}\mlm_{l = 1}^s \mat{C}_{l + r}\Bigr)
\end{displaymath}
Using $\K(\ten{A}\circ\ten{B}) = \ten{A}\otimes\ten{B}$ gives
\begin{displaymath}
\K\Bigl((\ten{A}\circ\ten{B})\mlm_{k = 1}^{r + s} \mat{C}_k\Bigr)
=
\Bigl(\ten{A}\mlm_{k = 1}^r \mat{C}_k\Bigr)\otimes\Bigl(\ten{B}\mlm_{l = 1}^s \mat{C}_{l + r}\Bigr)
\end{displaymath}
A generalization of the well known identity $\vec(\mat{A}\mat{B}\mat{C}) = (\t{\mat{C}}\otimes\mat{A})\vec{\mat{B}}$ is given by
\begin{displaymath}
\Bigl(\ten{A}\mlm_{k = 1}^r \mat{B}_k \Bigr)_{(\mat{i}, \mat{j})}
=
\Bigl( \bigotimes_{k = \#\mat{i}}^{1}\mat{B}_{\mat{i}_k} \Bigr)
\ten{A}_{(\mat{i}, \mat{j})}
\Bigl( \bigotimes_{l = \#\mat{j}}^{1}\t{\mat{B}_{\mat{j}_l}} \Bigr)
\end{displaymath}
with the special case
\begin{displaymath}
\vec\Bigl(\ten{A}\mlm_{k = 1}^r \mat{B}_k\Bigr)
=
\Bigl(\bigotimes_{k = r}^{1}\mat{B}_k\Bigr)\vec{\ten{A}}
\end{displaymath}
Furthermore, we have
\begin{displaymath}
(\ten{A}\otimes\ten{B})\mlm_{k = 1}^{r}\t{(\vec\mat{C}_k)}
=
\Bigl\langle \ten{A}\mlm_{k = 1}^{r} \mat{C}_k, \ten{B} \Bigr\rangle
=
\Bigl\langle \ten{A}, \ten{B}\mlm_{k = 1}^{r} \t{\mat{C}_k} \Bigr\rangle
=
\t{(\vec{\ten{B}})}\Bigl(\bigotimes_{k = r}^{1}\mat{C}_k\Bigr)\vec{\ten{A}}
\end{displaymath}
as well as for any tensor $\ten{A}$ of even order $2 r$ and matching square matrices $\mat{B}_k$ holds
\begin{displaymath}
\K(\ten{A})\mlm_{k = 1}^{r}\t{(\vec\mat{B}_k)}
=
\t{(\vec{\ten{A}})}\vec\Bigl(\bigotimes_{k = r}^{1}\t{\mat{B}_k}\Bigr)
\end{displaymath}
% \begin{lemma}\label{thm:kron-perm}
% Given $r$ matrices $\mat{A}_k$ of dimension $p_j\times q_j$ for $k = 1, \ldots, r$, then there exists a unique permutation matrix $\mat{S}_{\mat{p}, \mat{q}}$ such that
% \begin{equation}\label{eq:kron-to-outer-perm}
% \vec\bigkron_{k = r}^{1}\mat{A}_j = \mat{S}_{\mat{p}, \mat{q}}\vec\bigouter_{k = 1}^{r}\mat{A}_k.
% \end{equation}
% The permutation $\mat{S}_{\mat{p}, \mat{q}}$ with indices $\mat{p} = (p_1, \ldots, p_r)$ and $\mat{q} = (q_1, \ldots, q_r)$ is the matrix-matrix product of $r - 1$ permutation matrices given by
% \begin{multline}\label{eq:S_pq}
% \mat{S}_{\mat{p}, \mat{q}} =
% \Bigl[ \mat{I}_1\otimes \Bigl( \mat{I}_{\prod_{k = r}^{2}q_k}\otimes\mat{K}_{q_1, \prod_{k = r}^{2}p_k}\otimes I_{p_1} \Bigr)\Bigr] \\
% \Bigl[ \mat{I}_{p_1 q_1}\otimes \Bigl( \mat{I}_{\prod_{k = r}^{3}q_k}\otimes\mat{K}_{q_2, \prod_{k = r}^{3}p_k}\otimes I_{p_2} \Bigr) \Bigr]
% \cdots
% \Bigl[ \mat{I}_{\prod_{k = 1}^{r - 2}p_k q_k}\otimes \Bigl( \mat{I}_{q_r}\otimes\mat{K}_{q_{r - 1}, p_r}\otimes I_{p_{r - 1}} \Bigr) \Bigr]
% \end{multline}
% where $\mat{K}_{p, q}$ is the \emph{commutation matrix} from \textcite[Ch.~11]{MatrixAlgebra-AbadirMagnus2005}, that is the permutation such that $\vec{\t{\mat{A}}} = \mat{K}_{p, q}\vec{\mat{A}}$ for every $p\times q$ dimensional matrix $\mat{A}$.
% \end{lemma}
% \begin{proof}
% \textcite[Lemma~7]{SymMatandJacobians-MagnusNeudecker1986} states that
% \begin{align*}
% \vec(\mat{A}_2\otimes\mat{A}_1)
% &= (\mat{I}_{q_2}\otimes\mat{K}_{q_1, p_2}\otimes\mat{I}_{p_1})(\vec{\mat{A}_2}\otimes\vec{\mat{A}_1}) \\
% &= (\mat{I}_{q_2}\otimes\mat{K}_{q_1, p_2}\otimes\mat{I}_{p_1})\vec(\mat{A}_1\circ \mat{A}_2).
% \end{align*}
% This proves the statement for $r = 2$. The general statement for $r > 2$ follows via induction using \textcite[Lemma~7]{SymMatandJacobians-MagnusNeudecker1986} in conjunction with $\vec(\mat{C}\mat{a}\t{\mat{b}}) = (\mat{I}_{\dim(\mat{b})}\otimes\mat{C})\vec(\mat{a}\t{\mat{b}})$.
% \end{proof}
\begin{lemma}\label{thm:kron-perm}
Given $r \geq 2$ matrices $\mat{A}_k$ of dimension $p_k\times q_k$ for $k = 1, \ldots, r$, then there exists a unique permutation matrix $\mat{S}_{\mat{p}, \mat{q}}$ such that
\begin{equation}\label{eq:kron-to-outer-perm}
\vec\bigkron_{k = r}^{1}\mat{A}_k = \mat{S}_{\mat{p}, \mat{q}}\vec\bigouter_{k = 1}^{r}\mat{A}_k.
\end{equation}
The permutation $\mat{S}_{\mat{p}, \mat{q}}$ with indices $\mat{p} = (p_1, \ldots, p_r)$ and $\mat{q} = (q_1, \ldots, q_r)$ is defined recursively as
\begin{equation}\label{eq:S_pq}
\mat{S}_{\mat{p}, \mat{q}} = \mat{S}_{\bigl( \prod_{k = 1}^{r - 1}p_k, p_r \bigr), \bigl( \prod_{k = 1}^{r - 1}q_k, q_r \bigr)} \bigl(\mat{I}_{p_r q_r}\otimes\mat{S}_{(p_1, \ldots, p_{r-1}), (q_1, \ldots, q_{r-1})}\bigr)
\end{equation}
with initial value
\begin{displaymath}
\mat{S}_{(p_1, p_2), (q_1, q_2)} = \mat{I}_{q_2}\otimes\mat{K}_{q_1, p_2}\otimes\mat{I}_{p_1}
\end{displaymath}
where $\mat{K}_{p, q}$ is the \emph{commutation matrix} from \textcite[Ch.~11]{MatrixAlgebra-AbadirMagnus2005}, that is the permutation such that $\vec{\t{\mat{A}}} = \mat{K}_{p, q}\vec{\mat{A}}$ for every $p\times q$ dimensional matrix $\mat{A}$.
\end{lemma}
\begin{proof}
\textcite[Lemma~7]{SymMatandJacobians-MagnusNeudecker1986} states that
\begin{align}
\vec(\mat{A}_2\otimes\mat{A}_1)
&= (\mat{I}_{q_2}\otimes\mat{K}_{q_1, p_2}\otimes\mat{I}_{p_1})(\vec{\mat{A}_2}\otimes\vec{\mat{A}_1}) \label{eq:MagnusNeudecker1986-vec-kron-identity} \\
&= (\mat{I}_{q_2}\otimes\mat{K}_{q_1, p_2}\otimes\mat{I}_{p_1})\vec(\mat{A}_1\circ \mat{A}_2). \nonumber
\end{align}
This proves the statement for $r = 2$. The general statement for $r > 2$ follows via induction. Assuming \eqref{eq:kron-to-outer-perm} holds for $r - 1$, the induction step is then
\begin{multline*}
\vec{\bigkron_{k = r}^{1}}\mat{A}_k
= \vec\Bigl(\mat{A}_r\otimes\bigkron_{k = r - 1}^{1}\mat{A}_k\Bigr)
\overset{\eqref{eq:MagnusNeudecker1986-vec-kron-identity}}{=} \Bigl( \mat{I}_{q_r}\otimes\mat{K}_{\prod_{k = 1}^{r - 1}q_k, p_r}\otimes\mat{I}_{\prod_{k = 1}^{r - 1}p_k} \Bigr)\vec\Bigl((\vec\mat{A}_r)\otimes\vec\bigkron_{k = r - 1}^{1}\mat{A}_k\Bigr) \\
= \mat{S}_{\bigl( \prod_{k = 1}^{r - 1}p_k, p_r \bigr), \bigl( \prod_{k = 1}^{r - 1}q_k, q_r \bigr)}\vec\Bigl[\Bigl(\vec\bigkron_{k = r - 1}^{1}\mat{A}_k\Bigr)\t{(\vec\mat{A}_r)}\Bigr] \\
\overset{\eqref{eq:kron-to-outer-perm}}{=} \mat{S}_{\bigl( \prod_{k = 1}^{r - 1}p_k, p_r \bigr), \bigl( \prod_{k = 1}^{r - 1}q_k, q_r \bigr)}\vec\Bigl[\mat{S}_{(p_1, \ldots, p_{r-1}), (q_1, \ldots, q_{r-1})}\Bigl(\vec\bigouter_{k = 1}^{r - 1}\mat{A}_k\Bigr)\t{(\vec\mat{A}_r)}\Bigr] \\
\overset{(a)}{=} \mat{S}_{\bigl( \prod_{k = 1}^{r - 1}p_k, p_r \bigr), \bigl( \prod_{k = 1}^{r - 1}q_k, q_r \bigr)} \bigl(\mat{I}_{p_r q_r}\otimes\mat{S}_{(p_1, \ldots, p_{r-1}), (q_1, \ldots, q_{r-1})}\bigr)\vec\Bigl[\Bigl(\vec\bigouter_{k = 1}^{r - 1}\mat{A}_k\Bigr)\t{(\vec\mat{A}_r)}\Bigr] \\
= \mat{S}_{\mat{p}, \mat{q}}\vec\bigouter_{k = 1}^{r}\mat{A}_k.
\end{multline*}
Equality $(a)$ uses the relation $\vec(\mat{C}\mat{a}\t{\mat{b}}) = (\mat{I}_{\dim(\mat{b})}\otimes\mat{C})\vec(\mat{a}\t{\mat{b}})$ for a matrix $\mat{C}$ and vectors $\mat{a}, \mat{b}$.
\end{proof}
% \begin{remark}
% \todo{simplification of $\mat{S}$ if all dimensions are equal, that is if $p_k = p_j$ and $p_k = q_k$ for all $k, j$?!}
% \end{remark}
\begin{remark}
The permutation matrix $\mat{K}_{p, q}$ represents a perfect outer $p$-shuffle of $p q$ elements.
\end{remark}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Proofs}\label{app:B}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{proof}[Proof of \cref{thm:sdr}]\label{proof:sdr}
A direct implication of \textcite[Theorem~1]{sdr-BuraDuarteForzani2016} is that, under the exponential family \eqref{eq:quadratic-exp-fam} with natural statistic \eqref{eq:t-stat},
\begin{displaymath}
\t{\mat{\alpha}}(\mat{t}(\ten{X}) - \E\mat{t}(\ten{X}))
\end{displaymath}
is a sufficient reduction, where $\mat{\alpha}\in\mathbb{R}^{(p + d)\times q}$ with $\Span(\mat{\alpha}) = \Span(\{\mat{\eta}_y - \E_{Y}\mat{\eta}_Y : y\in\mathcal{S}_Y\})$. Since $\E_Y\ten{F}_Y = 0$, $\E_Y\mat{\eta}_{1 Y} = \E[\vec\overline{\ten{\eta}} + \mat{B}\vec\ten{F}_Y] = \vec\overline{\ten{\eta}}$. Thus,
\begin{displaymath}
\mat{\eta}_y - \E_{Y}\mat{\eta}_Y = \begin{pmatrix}
\mat{\eta}_{1 y} - \E_{Y}\mat{\eta}_{1 Y} \\
\mat{\eta}_{2} - \E_{Y}\mat{\eta}_{2}
\end{pmatrix} = \begin{pmatrix}
\mat{B}\vec\ten{F}_y \\
\mat{0}
\end{pmatrix}
\end{displaymath}
as $\mat{\eta}_{2}$ does not depend on $y$.
The set $\{ \vec{\ten{F}_y} : y\in\mathcal{S}_Y \}$ is a subset of $\mathbb{R}^q$. Therefore,
\begin{displaymath}
\Span\left(\{\mat{\eta}_y - \E_{Y}\mat{\eta}_Y : y\in\mathcal{S}_Y\}\right) = \Span\left(\left\{\begin{pmatrix}
\mat{B}\vec\ten{F}_y \\ \mat{0}
\end{pmatrix} : y\in\mathcal{S}_Y \right\}\right)
\subseteq
\Span\begin{pmatrix}
\mat{B} \\ \mat{0}
\end{pmatrix},
\end{displaymath}
which obtains that
\begin{displaymath}
\t{\begin{pmatrix}
\mat{B} \\ \mat{0}
\end{pmatrix}}(\mat{t}(\ten{X}) - \E\mat{t}(\ten{X}))
=
\t{\mat{B}}\vec(\ten{X} - \E\ten{X})
= \vec\Bigl((\ten{X} - \E\ten{X})\mlm_{k = 1}^{r}\t{\mat{\beta}_k}\Bigr)
\end{displaymath}
is also a sufficient reduction, though not necessarily minimal, using $\mat{B} = \bigkron_{k = r}^{1}\mat{\beta}_k$. When the exponential family is full rank, which in our setting amounts to all $\mat{\beta}_j$ being full rank matrices, $j=1,\ldots,r$, then \textcite[Theorem~1]{sdr-BuraDuarteForzani2016} also yields the minimality of the reduction.
\end{proof}
\todo{check proof of Thm 2}
\begin{proof}[Proof of \cref{thm:grad}]\label{proof:grad}
We first note that for any exponential family with density \eqref{eq:quad-density} the term $b(\mat{\eta}_{y_i})$ differentiated with respect to the natural parameter $\mat{\eta}_{y_i}$ is the expectation of the statistic $\mat{t}(\ten{X})$ given $Y = y_i$. In our case we get $\nabla_{\mat{\eta}_{y_i}}b = (\nabla_{\mat{\eta}_{1{y_i}}}b, \nabla_{\mat{\eta}_2}b)$ with components
\begin{displaymath}
\nabla_{\mat{\eta}_{1{y_i}}}b
= \E[\mat{t}_1(\ten{X})\mid Y = y_i]
= \vec\E[\ten{X}\mid Y = y_i]
= \vec\ten{g}_1(\mat{\eta}_{y_i})
\end{displaymath}
and
\begin{align*}
\nabla_{\mat{\eta}_{2}}b
&= \E[\mat{t}_2(\ten{X})\mid Y = y_i]
= \E[\mat{T}_2\vech((\vec\ten{X})\t{(\vec\ten{X})})\mid Y = y_i] \\
&= \E[\mat{T}_2\pinv{\mat{D}_p}\vec(\ten{X}\circ\ten{X})\mid Y = y_i]
= \mat{T}_2\pinv{\mat{D}_p}\vec\ten{g}_2(\mat{\eta}_{y_i}).
\end{align*}
The gradients are related to their derivatives by transposition, $\nabla_{\mat{\eta}_{1{y_i}}}b = \t{\D b(\mat{\eta}_{1{y_i}})}$ and $\nabla_{\mat{\eta}_2}b = \t{\D b(\mat{\eta}_2)}$.
Next we provide the differentials of the natural parameter components from \eqref{eq:eta1} and \eqref{eq:eta2} in a quite direct form, without any further ``simplifications'', because the down-stream computations won't benefit from reexpressing the following
\todo{in terms of $m_{\mat{\theta}}$, meaning without the sum, makes it a bit nicer!}
\begin{align*}
\d\mat{\eta}_{1{y_i}}(\overline{\ten{\eta}})
&= \d\vec{\overline{\ten{\eta}}}, \\
\d\mat{\eta}_{1{y_i}}(\mat{\beta}_j)
&= \vec\Bigl( \ten{F}_{y_i}\mlm_{\substack{k = 1\\k\neq j}}^{r}\mat{\beta}_k\times_j\d\mat{\beta}_j \Bigr), \\
\d\mat{\eta}_2(\mat{\Omega}_j)
&= \t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})})}\vec(c\,\d\mat{\Omega}) \\
&= c\t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})})}\vec\Bigl(\,\bigkron_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigkron_{l = j - 1}^{1}\mat{\Omega}_l \Bigr).
\end{align*}
All other combinations, namely $\d\mat{\eta}_{1{y_i}}(\mat{\Omega}_j)$, $\d\mat{\eta}_2(\overline{\ten{\eta}})$ and $\d\mat{\eta}_2(\mat{\beta}_j)$, are zero.
Continuing with the partial differentials of $l_n$ from \eqref{eq:log-likelihood}
\begin{multline*}
\d l_n(\overline{\ten{\eta}})
= \sum_{i = 1}^{n} (\langle \d\overline{\ten{\eta}}, \ten{X}_i \rangle - \D b(\mat{\eta}_{1{y_i}})\d\mat{\eta}_{1{y_i}}(\overline{\ten{\eta}}))
= \sum_{i = 1}^{n} \t{(\vec{\ten{X}_i} - \vec\ten{g}_1(\mat{\eta}_{y_i}))}\d\vec{\overline{\ten{\eta}}} \\
= \t{(\d\vec{\overline{\ten{\eta}}})}\vec\sum_{i = 1}^{n} (\ten{X}_i - \ten{g}_1(\mat{\eta}_{y_i})).
\end{multline*}
For every $j = 1, \ldots, r$ we get the differentials
\begin{multline*}
\d l_n(\mat{\beta}_j)
= \sum_{i = 1}^{n} \biggl(\Bigl\langle \ten{F}_{y_i}\mlm_{\substack{k = 1\\k\neq j}}^{r}\mat{\beta}_k\times_j\d\mat{\beta}_j, \ten{X}_i \Bigr\rangle - \D b(\mat{\eta}_{1{y_i}})\d\mat{\eta}_{1{y_i}}(\mat{\beta}_j)\biggr)
= \sum_{i = 1}^{n} \Bigl\langle \ten{F}_{y_i}\mlm_{\substack{k = 1\\k\neq j}}^{r}\mat{\beta}_k\times_j\d\mat{\beta}_j, \ten{X}_i - \ten{g}_1(\mat{\eta}_{y_i}) \Bigr\rangle \\
= \sum_{i = 1}^{n} \tr\biggl( \d\mat{\beta}_j\Bigl(\ten{F}_{y_i}\mlm_{\substack{k = 1\\k\neq j}}^{r}\mat{\beta}_k\Bigr)_{(j)} \t{(\ten{X}_i - \ten{g}_1(\mat{\eta}_{y_i}))_{(j)}} \biggr)
= \t{(\d\vec{\mat{\beta}_j})}\vec\sum_{i = 1}^{n} (\ten{X}_i - \ten{g}_1(\mat{\eta}_{y_i}))_{(j)} \t{\Bigl(\ten{F}_{y_i}\mlm_{\substack{k = 1\\k\neq j}}^{r}\mat{\beta}_k\Bigr)_{(j)}}
\end{multline*}
as well as
\begin{multline*}
\d l_n(\mat{\Omega}_j)
= \sum_{i = 1}^{n} \biggl( c\Bigl\langle \ten{X}_i\mlm_{\substack{k = 1\\k\neq j}}^{r}\mat{\Omega}_k\times_j\d\mat{\Omega}_j, \ten{X}_i \Bigr\rangle - \D b(\mat{\eta}_2)\d\mat{\eta}_2(\mat{\Omega}_j) \biggr) \\
= c\sum_{i = 1}^{n} \biggl( \Bigl\langle \ten{X}_i\mlm_{\substack{k = 1\\k\neq j}}^{r}\mat{\Omega}_k\times_j\d\mat{\Omega}_j, \ten{X}_i \Bigr\rangle - \t{(\mat{T}_2\pinv{\mat{D}_p}\vec\ten{g}_2(\mat{\eta}_{y_i}))}\t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})})}\vec\Bigl(\,\bigkron_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigkron_{l = j - 1}^{1}\mat{\Omega}_l \Bigr) \biggr) \\
= c\sum_{i = 1}^{n} \biggl( \Bigl\langle \ten{X}_i\mlm_{\substack{k = 1\\k\neq j}}^{r}\mat{\Omega}_k\times_j\d\mat{\Omega}_j, \ten{X}_i \Bigr\rangle - \t{(\vec\ten{G}_2(\mat{\eta}_{y_i}))}\vec\Bigl(\,\bigkron_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigkron_{l = j - 1}^{1}\mat{\Omega}_l \Bigr) \biggr) \\
= c\sum_{i = 1}^{n} \biggl( \t{\vec(\ten{X}_i\circ\ten{X}_i - \ten{G}_2(\mat{\eta}_{y_i}))}\vec\Bigl(\,\bigkron_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigkron_{l = j - 1}^{1}\mat{\Omega}_l \Bigr) \biggr) \\
= c\sum_{i = 1}^{n} \K(\ten{X}_i\circ\ten{X}_i - \ten{G}_2(\mat{\eta}_{y_i}))\mlm_{\substack{k = 1\\k\neq j}}^{r}\t{(\vec{\mat{\Omega}_k})}\times_j\t{(\d\vec{\mat{\Omega}_j})} \\
= c\t{(\d\vec{\mat{\Omega}_j})}\sum_{i = 1}^{n} \Bigl((\ten{X}_i\otimes\ten{X}_i - \K(\ten{G}_2(\mat{\eta}_{y_i})))\mlm_{\substack{k = 1\\k\neq j}}^{r}\t{(\vec{\mat{\Omega}_k})}\Bigr)_{(j)} \\
= c\t{(\d\vec{\mat{\Omega}_j})}\vec\sum_{i = 1}^{n} (\ten{X}_i\otimes\ten{X}_i - \K(\ten{G}_2(\mat{\eta}_{y_i})))\mlm_{\substack{k = 1\\k\neq j}}^{r}\t{(\vec{\mat{\Omega}_k})}
\end{multline*}
Now, applying the identity $\d \ten{A}(\ten{B}) = \t{(\d\vec{\ten{B}})}\nabla_{\ten{B}}\ten{A}$ gives the required partial gradients.
Finally, if $\mat{T}_2$ is the identity matrix, then
\begin{displaymath}
\vec{\ten{G}_2(\mat{\eta}_y)} = \pinv{(\mat{T}_2\pinv{\mat{D}_p})}\mat{T}_2\pinv{\mat{D}_p}\vec{\ten{g}_2(\mat{\eta}_y)}
= \mat{D}_p\pinv{\mat{D}_p}\vec{\ten{g}_2(\mat{\eta}_y)}
= \vec{\ten{g}_2(\mat{\eta}_y)}
\end{displaymath}
where the last equality holds because $\mat{N}_p = \mat{D}_p\pinv{\mat{D}_p}$ is the symmetrizer matrix from \textcite[Ch.~11]{MatrixAlgebra-AbadirMagnus2005}. The symmetrizer matrix $\mat{N}_p$ satisfies $\mat{N}_p\vec{\mat{A}} = \vec{\mat{A}}$ if $\mat{A} = \t{\mat{A}}$, while
\begin{displaymath}
\vec{\ten{g}_2(\mat{\eta}_y)} = \vec\E[\ten{X}\circ\ten{X}\mid Y = y] = \vec\E[(\vec{\ten{X}})\t{(\vec{\ten{X}})}\mid Y = y]
\end{displaymath}
is the vectorization of a symmetric matrix.
\end{proof}
\begin{proof}[Proof of \cref{thm:kron-manifolds}]\label{proof:kron-manifolds}
We start by considering the first case and assume that $\manifold{B}$ is spherical with radius $1$ w.l.o.g. We equip $\manifold{K} = \{ \mat{A}\otimes \mat{B} : \mat{A}\in\manifold{A}, \mat{B}\in\manifold{B} \}\subset\mathbb{R}^{p_1 p_2\times q_1 q_2}$ with the subspace topology \efi{cite{?}}. Define the hemispheres $H_i^{+} = \{ \mat{B}\in\manifold{B} : (\vec{\mat{B}})_i > 0 \}$ and $H_i^{-} = \{ \mat{B}\in\manifold{B} : (\vec{\mat{B}})_i < 0 \}$ for $i = 1, \ldots, p_2 q_2$. The hemispheres are an open cover of $\manifold{B}$ with respect to the subspace topology. Define for every $H_i^{\pm}$, where $\pm$ is a placeholder for either $+$ or $-$, the function
\begin{displaymath}
f_{H_i^{\pm}} : \manifold{A}\times H_i^{\pm}\to\mathbb{R}^{p_1 p_2\times q_1 q_2}
: (\mat{A}, \mat{B})\mapsto \mat{A}\otimes \mat{B}
\end{displaymath}
which is smooth. With the spherical property of $\manifold{B}$ the relation $\|\mat{A}\otimes \mat{B}\|_F = \|\mat{A}\|_F$ for all $\mat{A}\otimes \mat{B}\in\manifold{K}$ ensures that the function $f_{H_i^{\pm}}$, constrained to its image, is bijective with inverse function (identifying $\mathbb{R}^{p\times q}$ with $\mathbb{R}^{p q}$) given by
\begin{displaymath}
f_{H_i^{\pm}}^{-1} : f_{H_i^{\pm}}(\manifold{A}\times H_i^{\pm})\to\manifold{A}\times H_i^{\pm}
: \mat{C}\mapsto \left(\pm\frac{\|\mat{C}\|_F}{\|\mat{R}(\mat{C})\mat{e}_i\|_2}\mat{R}(\mat{C})\mat{e}_i, \pm\frac{1}{\|\mat{C}\|_F\|\mat{R}(\mat{C})\mat{e}_i\|_2}\t{\mat{R}(\mat{C})}\mat{R}(\mat{C})\mat{e}_i\right)
\end{displaymath}
where $\pm$ is $+$ for a ``positive'' hemisphere $H_i^{+}$ and $-$ otherwise, $\mat{e}_i\in\mathbb{R}^{p_2 q_2}$ is the $i$th unit vector and $\mat{R}(\mat{C})$ is a ``reshaping'' permutation \footnote{Relating to $\K$ the operation $\mat{R}$ is basically its inverse as $\K(\mat{A}\circ\mat{B}) = \mat{A}\otimes\mat{B}$ with a mismatch in the shapes only.} which acts on Kronecker products as $\mat{R}(\mat{A}\otimes \mat{B}) = (\vec{\mat{A}})\t{(\vec{\mat{B}})}$. This makes $f_{H_i^{\pm}}^{-1}$ a combination of smooth functions ($\mat{0}$ is excluded from $\manifold{A}, \manifold{B}$ guarding against division by zero) and as such it is also smooth. This ensures that $f_{H_i^{\pm}} : \manifold{A}\times {H_i^{\pm}}\to f_{H_i^{\pm}}(\manifold{A}\times {H_i^{\pm}})$ is a diffeomorphism.
Next, we construct an atlas\footnote{A collection of charts $\{ \varphi_i : i\in I \}$ with index set $I$ of a manifold $\manifold{A}$ is called an \emph{atlas} if the pre-images of the charts $\varphi_i$ cover the entire manifold $\manifold{A}$.} for $\manifold{K}$ which is equipped with the subspace topology. Let $(\varphi_j, U_j)_{j\in J}$ be an atlas of $\manifold{A}\times\manifold{B}$. Such an atlas exists and admits a unique smooth structure as both $\manifold{A}, \manifold{B}$ are embedded manifolds from which we take the product manifold. The images of the coordinate domains $f_H(U_j)$ are open in $\manifold{K}$, since $f_H$ is a diffeomorphism, with the corresponding coordinate maps
\begin{displaymath}
\phi_{H_i^{\pm},j} : f_{H_i^{\pm}}(U_j)\to \varphi_j(U_j)
: \mat{C}\mapsto \varphi_j(f_{H_i^{\pm}}^{-1}(\mat{C})).
\end{displaymath}
By construction the set $\{ \phi_{H_i^{\pm},j} : i = 1, \ldots, p_2 q_2, \pm\in\{+, -\}, j\in J \}$ is an atlas if the charts are compatible. This means we need to check if the transition maps are diffeomorphisms. Let $(\phi_{H, j}, V_j), (\phi_{\widetilde{H}, k}, V_k)$ be two arbitrary charts from our atlas, then the transition map $\phi_{\widetilde{H}, k}\circ\phi_{H,j}^{-1}:\phi_{H,j}(V_j\cap V_k)\to\phi_{\widetilde{H},k}(V_j\cap V_k)$ has the form
\begin{displaymath}
\phi_{\widetilde{H}, k}\circ\phi_{H,j}^{-1}
= \varphi_k\circ f_{\widetilde{H}}^{-1}\circ f_{H}\circ\varphi_{j}^{-1}
= \varphi_k\circ (\pm\mathrm{id})\circ\varphi_{j}^{-1}
\end{displaymath}
where $\pm$ depends on $H, \widetilde{H}$ being of the same ``sign'' and $\mathrm{id}$ is the identity. We conclude that the charts are compatible, which makes the constructed set of charts an atlas. With that we have shown that the topological manifold $\manifold{K}$ with the subspace topology admits a smooth atlas that makes it an embedded smooth manifold with dimension equal to the dimension of the product manifold $\manifold{A}\times\manifold{B}$; that is, $d = \dim\manifold{A} + \dim\manifold{B}$.
It remains to show that the cone condition also admits a smooth manifold. The identity $\manifold{K} = \{ \mat{A}\otimes \mat{B} : \mat{A}\in\manifold{A}, \mat{B}\in\widetilde{\manifold{B}} \}$, where $\widetilde{\manifold{B}} = \{ \mat{B}\in\manifold{B} : \|\mat{B}\|_F = 1 \}$, holds if both $\manifold{A}, \manifold{B}$ are cones. Since $g:\manifold{B}\to\mathbb{R}:\mat{B}\mapsto \|\mat{B}\|_F$ is smooth on $\manifold{B}$ with full rank $1$ everywhere, $\widetilde{\manifold{B}} = g^{-1}(1)$ is a $\dim{\manifold{B}} - 1$ dimensional embedded submanifold of $\manifold{B}$. An application of the spherical case proves the cone case.
\end{proof}
\begin{proof}[Proof of \cref{thm:param-manifold}]
An application of \cref{thm:kron-manifold-tangent-space} ensures that $\manifold{K}_{\mat{B}}$ and $\manifold{K}_{\mat{\Omega}}$ are embedded submanifolds.
With $\mat{T}_2$ being a $d\times p(p + 1) / 2$ full rank matrix and the duplication matrix $\mat{D}_p$ being of full rank with dimension $p^2\times p(p + 1) / 2$, its pseudo inverse $\pinv{\mat{D}_p}$ is of full rank with dimension $p(p + 1) / 2 \times p^2$ and we have $\mat{T}_2\pinv{\mat{D}_p}$ to be $d\times p^2$ of full rank. This means that $\mat{P} = \pinv{(\mat{T}_2\pinv{\mat{D}_p})}\mat{T}_2\pinv{\mat{D}_p}$ is a $p^2\times p^2$ projection of rank $d$ and $\mat{I}_{p^2} - \mat{P}$ is then a projection of rank $p^2 - d$. This leads to
\begin{displaymath}
\manifold{CK}_{\mat{\Omega}}
= \{ \mat{\Omega}\in\manifold{K}_{\mat{\Omega}} : (\mat{I}_{p^2} - \mat{P})\vec{\mat{\Omega}} = \mat{0} \}.
\end{displaymath}
To show that $\manifold{CK}_{\mat{\Omega}}$ is an embedded submanifold of $\manifold{K}_{\mat{\Omega}}$ we apply the ``Constant-Rank Level Set Theorem'' \textcite[Thm~5.12]{introToSmoothMani-Lee2012} which states (slightly adapted) the following:
Let $\manifold{A}$, $\manifold{B}$ be smooth manifolds and $F:\manifold{A}\to\manifold{B}$ a smooth map such that $\nabla_{\mat{a}} F$ has constant matrix rank for all $\mat{a}\in\manifold{A}$. Then, for every $\mat{b}\in F(\manifold{A})\subseteq\manifold{B}$, the preimage $F^{-1}(\{ \mat{b} \})$ is a smooth embedded submanifold of $\manifold{A}$.
In our setting, we have $F:\manifold{K}_{\mat{\Omega}}\to\mathbb{R}^{p^2}$ defined as $F(\mat{\Omega}) = (\mat{I}_{p^2} - \mat{P})\vec{\mat{\Omega}}$ with gradient $\nabla_{\mat{\Omega}} F = \mat{I}_{p^2} - \mat{P}$ of constant rank. Therefore, $F^{-1}(\{\mat{0}\}) = \manifold{CK}_{\mat{\Omega}}$ is an embedded submanifold of $\manifold{K}_{\mat{\Omega}}$.
Finally, the finite product manifold of embedded submanifolds is embedded in the finite product space of their ambient spaces, that is $\Theta = \mathbb{R}^p \times \manifold{K}_{\mat{B}}\times\manifold{CK}_{\mat{\Omega}} \subset \mathbb{R}^p\times\mathbb{R}^{p\times q}\times\mathbb{R}^{p\times p}$ is embedded.
\end{proof}
\begin{proof}[Proof of \cref{thm:exists-strong-M-estimator-on-subsets}]
Let $\hat{\mat{\xi}}_n$ be a (weak/strong) M-estimator for the unconstrained problem. This gives by definition, in any case, that
\begin{displaymath}
\sup_{\mat{\xi}\in\Xi} M_n(\mat{\xi}) \leq M_n(\hat{\mat{\xi}}_n) + o_P(1).
\end{displaymath}
Because $\emptyset\neq\Theta\subseteq\Xi$ we have $\sup_{\mat{\theta}\in\Theta} M_n(\mat{\theta}) \leq \sup_{\mat{\xi}\in\Xi} M_n(\mat{\xi})$ and with $M_n(\mat{\xi}) < \infty$ for any $\mat{\xi}\in\Xi$
\begin{displaymath}
P\Bigl( \sup_{\mat{\theta}\in\Theta} M_n(\mat{\theta}) < \infty \Bigr)
\geq
P\Bigl( \sup_{\mat{\xi}\in\Xi} M_n(\mat{\xi}) < \infty \Bigr)
\xrightarrow{n\to\infty}
1.
\end{displaymath}
If $\sup_{\mat{\theta}\in\Theta} M_n(\mat{\theta}) < \infty$, then, for any $0 < \epsilon_n$ there exists $\hat{\mat{\theta}}_n\in\Theta$ such that $\sup_{\mat{\theta}\in\Theta} M_n(\mat{\theta}) - \epsilon_n \leq M_n(\hat{\mat{\theta}}_n)$. Therefore, we can choose $\epsilon_n\in o(n^{-1})$, which yields
\begin{displaymath}
P\Bigl( M_n(\hat{\mat{\theta}}_n) \geq \sup_{\mat{\theta}\in\Theta} M_n(\mat{\theta}) - o(n^{-1}) \Bigr)
\geq
P\Bigl( \sup_{\mat{\theta}\in\Theta} M_n(\mat{\theta}) < \infty \Bigr)
\xrightarrow{n\to\infty}
1.
\end{displaymath}
The last statement states
\begin{displaymath}
M_n(\hat{\mat{\theta}}_n) \geq \sup_{\mat{\theta}\in\Theta} M_n(\mat{\theta}) - o_P(n^{-1})
\end{displaymath}
which is the definition of $\hat{\mat{\theta}}_n$ being a strong M-estimator over $\Theta$.
\end{proof}
\begin{proof}[Proof of \cref{thm:M-estimator-consistency-on-subsets}]
It follows the proof of \textcite[Proposition~2.4]{asymptoticMLE-BuraEtAl2018} with the same assumptions. The only exception is we only require $\Theta$ to be a subset of $\Xi$. This is accounted for by replacing Lemma~2.3 in \textcite{asymptoticMLE-BuraEtAl2018} with \cref{thm:exists-strong-M-estimator-on-subsets} to obtain the existence of a strong M-estimator on $\Theta$.
\end{proof}
\begin{proof}[Proof of \cref{thm:M-estimator-asym-normal-on-manifolds}]
Let $\varphi:U\to\varphi(U)$ be a coordinate chart\footnote{By \cref{def:manifold}, the chart $\varphi : U\to\varphi(U)$ is bi-continuous, is infinitely often continuously differentiable, and has a continuously differentiable inverse $\varphi^{-1} : \varphi(U) \to U$. Furthermore, the domain $U$ is open according to the trace topology on $\Theta$, that means that there exists an open set $O\subseteq\mathbb{R}^p$ such that $U = \Theta\cap O$.} with $\mat{\theta}_0\in U\subseteq\Theta$. As $\varphi$ is continuous we get with the continuous mapping theorem on metric spaces \textcite[Thm~18.11]{asymStats-van_der_Vaart1998} that $\varphi(\hat{\mat{\theta}}_n)\xrightarrow{p}\varphi(\mat{\theta}_0)$ which implies $P(\varphi(\hat{\mat{\theta}}_n)\in\varphi(U))\xrightarrow{n\to\infty}1$.
The next step is to apply \textcite[Thm~5.23]{asymStats-van_der_Vaart1998} to $\hat{\mat{s}}_n = \varphi(\hat{\mat{\theta}}_n)$. Therefore, assume that $\hat{\mat{s}}_n\in\varphi(U)$. Denote with $\mat{s} = \varphi(\mat{\theta})\in\varphi(U)\subseteq\mathbb{R}^d$ the coordinates of the parameter $\mat{\theta}\in U\subseteq\Theta$ of the $d = \dim(\Theta)$ dimensional manifold $\Theta\subseteq\mathbb{R}^p$. With $\varphi : U\to\varphi(U)$ being bijective, we can express $m_{\mat{\theta}}$ in terms of $\mat{s} = \varphi(\mat{\theta})$ for every $\mat{\theta}\in U$ as $m_{\mat{\theta}} = m_{\varphi^{-1}(\mat{s})}$. Furthermore, denote
\begin{displaymath}
M(\mat{\theta}) = \E[m_{\mat{\theta}}(Z)] \qquad\text{and}\qquad M_{\varphi}(\mat{s}) = \E[m_{\varphi^{-1}(\mat{s})}(Z)] = M(\varphi^{-1}(\mat{s})).
\end{displaymath}
\begin{figure}[hpt!]
\centering
\includegraphics{images/embeddImage.pdf}
\caption{\label{fig:proof:M-estimator-asym-normal-on-manifolds}Depiction of the notation used in the proof of \cref{thm:M-estimator-asym-normal-on-manifolds}. Example with $p = 3$ and $d = \dim(\Theta) = 2$.}
\end{figure}
By assumption, the function $M(\mat{\theta})$ is twice continuously differentiable in a neighborhood\footnote{A set $N$ is called a neighborhood of $u$ if there exists an open set $O$ such that $u\in O\subseteq N$.} of $\mat{\theta}_0$. W.l.o.g. we can assume that $U$ is contained in that neighborhood. Then, using the chain rule, we get the gradient of $M_{\varphi}$ at $\mat{s}_0$ to be $\mat{0}$ by
\begin{displaymath}
\nabla M_{\varphi}(\mat{s}_0) = {\nabla\varphi^{-1}(\mat{s}_0)}{\nabla M(\varphi^{-1}(\mat{s}_0))} = {\nabla\varphi^{-1}(\mat{s}_0)}{\nabla M(\mat{\theta}_0)} = {\nabla\varphi^{-1}(\mat{s}_0)}\mat{0} = \mat{0}
\end{displaymath}
because $\mat{\theta}_0 = \varphi^{-1}(\mat{s}_0)$ is a maximizer of $M$. For the second-derivative, evaluated at $\mat{s}_0 = \varphi(\mat{\theta}_0)$, we have
\begin{displaymath}
\nabla^2 M_{\varphi}(\mat{s}_0)
= \nabla\varphi^{-1}(\mat{s}_0)\nabla^2 M(\varphi^{-1}(\mat{s}_0))\t{\nabla\varphi^{-1}(\mat{s}_0)}
= \nabla\varphi^{-1}(\mat{s}_0)\mat{H}_{\mat{\theta}_0}\t{\nabla\varphi^{-1}(\mat{s}_0)}
\end{displaymath}
using $\nabla M_{\varphi}(\mat{s}_0) = \mat{0}$. This gives the second-order Taylor expansion of $M_{\varphi}$ at $\mat{s}_0$ as
\begin{displaymath}
M_{\varphi}(\mat{s}) = M_{\varphi}(\mat{s}_0) + \frac{1}{2}\t{(\mat{s} - \mat{s}_0)} \nabla^2 M_{\varphi}(\mat{s}_0) (\mat{s} - \mat{s}_0) + o(\|\mat{s} - \mat{s}_0\|^2).
\end{displaymath}
We also need to check the local Lipschitz condition of $m_{\varphi^{-1}(\mat{s})}$. Therefore, let $V_{\epsilon}(\mat{s}_0) = \{ \mat{s}\in\mathbb{R}^d : \|\mat{s} - \mat{s}_0\| < \epsilon \}$ be the open $\epsilon$-ball with center $\mat{s}_0$. Since $\varphi(U)$ contains $\mat{s}_0$, and is open in $\mathbb{R}^d$, there exists an $\epsilon > 0$ such that $V_{\epsilon}(\mat{s}_0)\subseteq\varphi(U)$. Then, the closed $\epsilon/2$ ball $\overline{V}_{\epsilon / 2}(\mat{s}_0)$ is a neighborhood of $\mat{s}_0$ and the supremum $\sup_{\mat{s}\in \overline{V}_{\epsilon / 2}(\mat{s}_0)}\|\nabla\varphi^{-1}(\mat{s})\| < \infty$ due to the continuity of $\nabla\varphi^{-1}$ on $\varphi(U)$ with $\overline{V}_{\epsilon / 2}(\mat{s}_0)\subset V_{\epsilon}(\mat{s}_0)\subseteq\varphi(U)$. Then, for almost every $z$ and every $\mat{s}_1 = \varphi(\mat{\theta}_1), \mat{s}_2 = \varphi(\mat{\theta}_2)\in\overline{V}_{\epsilon / 2}(\mat{s}_0)$ holds
\begin{multline*}
| m_{\varphi^{-1}(\mat{s}_1)}(z) - m_{\varphi^{-1}(\mat{s}_2)}(z) |
= | m_{\mat{\theta}_1}(z) - m_{\mat{\theta}_2}(z) |
\overset{(a)}{\leq} u(z) \| \mat{\theta}_1 - \mat{\theta}_2 \|
= u(z) \| \varphi^{-1}(\mat{s}_1) - \varphi^{-1}(\mat{s}_2) \| \\
\overset{(b)}{\leq} u(z) \sup_{\mat{s}\in \overline{V}_{\epsilon / 2}(\mat{s}_0)}\|\nabla\varphi^{-1}(\mat{s})\| \|\mat{s}_1 - \mat{s}_2\|
=: v(z) \|\mat{s}_1 - \mat{s}_2\|.
\end{multline*}
Here, $(a)$ holds by assumption and $(b)$ is a result of the mean value theorem. Now, $v(z)$ is measurable and square integrable as a scaled version of $u(z)$. Finally, with $\varphi$ being one-to-one, we get that $\hat{\mat{s}}_n = \varphi(\hat{\mat{\theta}}_n)$ is a strong M-estimator for $\mat{s}_0 = \varphi(\mat{\theta}_0)$ of the objective $M_{\varphi}$. Now, we apply \textcite[Thm~5.23]{asymStats-van_der_Vaart1998} to get the asymptotic normality of $\hat{\mat{s}}_n$ as
\begin{displaymath}
\sqrt{n}(\hat{\mat{s}}_n - \mat{s}_0) \xrightarrow{d} \mathcal{N}_{d}(0, \mat{\Sigma}_{\mat{s}_0})
\end{displaymath}
where the $d\times d$ variance-covariance matrix $\mat{\Sigma}_{\mat{s}_0}$ is given by
\begin{align*}
\mat{\Sigma}_{\mat{s}_0} &= (\nabla^2 M_{\varphi}(\mat{s}_0))^{-1}\E[\nabla_{\mat{s}} m_{\varphi^{-1}(\mat{s}_0)}(Z)\t{(\nabla_{\mat{s}} m_{\varphi^{-1}(\mat{s}_0)}(Z))}](\nabla^2 M_{\varphi}(\mat{s}_0))^{-1}.
\end{align*}
{
\def\PP{\mat{\varPhi}_{\mat{\theta}_0}}
\def\EE#1#2{\E[\nabla_{#2} m_{#1}(Z)\t{(\nabla_{#2} m_{#1}(Z))}]}
An application of the delta method yields
\begin{displaymath}
\sqrt{n}(\hat{\mat{\theta}}_n - \mat{\theta}_0)
= \sqrt{n}(\varphi^{-1}(\hat{\mat{s}}_n) - \varphi^{-1}(\mat{s}_0))
\xrightarrow{d} \mathcal{N}_p(0, \t{\nabla\varphi^{-1}(\mat{s}_0)}\mat{\Sigma}_{\mat{s}_0}{\nabla\varphi^{-1}(\mat{s}_0)}).
\end{displaymath}
We continue by reexpressing the $p\times p$ asymptotic variance-covariance matrix of $\hat{\mat{\theta}}_n$ in terms of $\mat{\theta}_0$ instead of $\mat{s}_0 = \varphi(\mat{\theta}_0)$. Therefore, let $\PP = \t{\nabla\varphi^{-1}(\varphi(\mat{\theta}_0))} = \t{\nabla\varphi^{-1}(\mat{s}_0)}$ and observe that for all $\mat{s}\in\varphi(U)$, the gradient of $\mat{s}\mapsto m_{\varphi^{-1}(\mat{s})}(z)$ evaluated at $\mat{s}_0 = \varphi(\mat{\theta}_0)$ has the form
\begin{displaymath}
\nabla_{\mat{s}}m_{\varphi^{-1}(\mat{s}_0)}(z)
= \nabla\varphi^{-1}(\mat{s}_0)\nabla_{\mat{\theta}}m_{\mat{\theta}_0}(z)
= \t{\PP}\nabla_{\mat{\theta}}m_{\mat{\theta}_0}(z).
\end{displaymath}
Then
\begin{multline*}
\t{\nabla\varphi^{-1}(\mat{s}_0)}\mat{\Sigma}_{\mat{s}_0}{\nabla\varphi^{-1}(\mat{s}_0)} \\
= \t{\nabla\varphi^{-1}(\mat{s}_0)}(\nabla^2 M_{\varphi}(\mat{s}_0))^{-1}\EE{\varphi^{-1}(\mat{s}_0)}{\mat{s}}(\nabla^2 M_{\varphi}(\mat{s}_0))^{-1}{\nabla\varphi^{-1}(\mat{s}_0)} \\
= {\PP}(\t{\PP}\mat{H}_{\mat{\theta}_0}\PP)^{-1}\t{\PP}\EE{\mat{\theta}_0}{\mat{\theta}}{\PP}(\t{\PP}\mat{H}_{\mat{\theta}_0}\PP)^{-1}\t{\PP} \\
= \mat{\Pi}_{\mat{\theta}_0}\EE{\mat{\theta}_0}{\mat{\theta}}\mat{\Pi}_{\mat{\theta}_0}
\end{multline*}
where the last equality holds by $\Span\PP = T_{\mat{\theta}_0}\Theta$ by \cref{def:tangent-space} of the tangent space $T_{\mat{\theta}_0}\Theta$.
It remains to show that $\mat{\Pi}_{\mat{\theta}_0} = \mat{P}_{\mat{\theta}_0}\pinv{(\t{\mat{P}_{\mat{\theta}_0}}\mat{H}_{\mat{\theta}_0}\mat{P}_{\mat{\theta}_0})}\t{\mat{P}_{\mat{\theta}_0}}$ for any $p\times k$ matrix $\mat{P}_{\mat{\theta}_0}$ such that $k\geq d$ and $\Span{\mat{P}_{\mat{\theta}_0}} = T_{\mat{\theta}_0}\Theta$. This also ensures that the final result is independent of the chosen chart $\varphi$, since the tangent space does not depend on a specific chart. Therefore, let $\PP = {\mat{Q}}{\mat{R}}$ and $\mat{P}_{\mat{\theta}_0} = \widetilde{\mat{Q}}\widetilde{\mat{R}}$ be their thin QR decompositions, respectively. Both $\mat{Q}, \widetilde{\mat{Q}}$ have dimension $p\times d$. With $\mat{Q}$ being semi-orthogonal, $\mat{R}$ is invertible of dimension $d\times d$ while $\widetilde{\mat{R}}$ is a $d\times k$ full row-rank matrix. With $\mat{Q}$ being semi-orthogonal the $p\times p$ matrix $\mat{Q}\t{\mat{Q}}$ is an orthogonal projection onto $\Span\mat{Q} = \Span\mat{P}_{\mat{\theta}_0} = T_{\mat{\theta}_0}\Theta$. This allows us to express $\mat{P}_{\mat{\theta}_0}$ in terms of $\mat{Q}$ as
\begin{displaymath}
\mat{P}_{\mat{\theta}_0} = \mat{Q}\t{\mat{Q}}\mat{P}_{\mat{\theta}_0}
= \mat{Q}\t{\mat{Q}}\widetilde{\mat{Q}}\widetilde{\mat{R}} =: {\mat{Q}}\mat{M}.
\end{displaymath}
From $\Span\mat{Q} = \Span\mat{P}_{\mat{\theta}_0}$ follows that the $d\times k$ matrix $\mat{M}$ is also of full row-rank. We get $\mat{M}\pinv{\mat{M}} = \mat{I}_d = \mat{R}\mat{R}^{-1}$ as a property of the Moore-Penrose pseudo inverse with $\mat{M}$ being of full row-rank. Another property of the pseudo inverse is that for matrices $\mat{A}, \mat{B}$, where $\mat{A}$ has full column-rank and $\mat{B}$ has full row-rank, holds $\pinv{(\mat{A}\mat{B})} = \pinv{\mat{B}}\pinv{\mat{A}}$. This enables the computation
\begin{multline*}
\mat{P}_{\mat{\theta}_0}\pinv{(\t{\mat{P}_{\mat{\theta}_0}}\mat{H}_{\mat{\theta}_0}\mat{P}_{\mat{\theta}_0})}\t{\mat{P}_{\mat{\theta}_0}}
= \mat{Q} \mat{M} \pinv{\mat{M}} (\t{\mat{Q}} \mat{H}_{\mat{\theta}_0} \mat{Q})^{-1} \t{(\mat{M} \pinv{\mat{M}})} \t{\mat{Q}} \\
= \mat{Q} {\mat{R}} {\mat{R}}^{-1} (\t{\mat{Q}} \mat{H}_{\mat{\theta}_0} \mat{Q})^{-1} \t{({\mat{R}} {\mat{R}}^{-1})} \t{\mat{Q}}
= \PP(\t{\PP}\mat{H}_{\mat{\theta}_0}\PP)^{-1}\t{\PP}
= \mat{\Pi}_{\mat{\theta}_0}.
\end{multline*}
}
\end{proof}
In the following we rewrite the log-likelihood \eqref{eq:log-likelihood} in a simpler form. This simplifies the proof of \cref{thm:asymptotic-normality-gmlm} as well as provides the notation to express the regularity conditions of \cref{thm:asymptotic-normality-gmlm} in a compact form.
Rewriting the first natural parameter component $\mat{\eta}_{1y}$ defined in \eqref{eq:eta1-manifold} gives
\begin{displaymath}
\mat{\eta}_{1y}
= \vec{\overline{\ten{\eta}}} + \mat{B}\vec{\ten{F}_y}
= \mat{I}_p\vec{\overline{\ten{\eta}}} + (\t{(\vec{\ten{F}_y})}\otimes\mat{I}_p)\vec{\mat{B}}
= \begin{pmatrix}
\mat{I}_p & \t{(\vec{\ten{F}_y})}\otimes\mat{I}_p
\end{pmatrix}\begin{pmatrix}
\vec{\overline{\ten{\eta}}} \\
\vec{\mat{B}}
\end{pmatrix}.
\end{displaymath}
For the second natural parameter component $\mat{\eta}_2$, modeled in \eqref{eq:eta2-manifold}, we have
\begin{displaymath}
\langle \mat{\eta}_2, \mat{T}_2\vech((\vec{\ten{X}})\t{(\vec{\ten{X}})}) \rangle
= \langle \t{(\mat{T}_2\pinv{\mat{D}_p})}\mat{\eta}_2, \vec(\ten{X}\circ\ten{X}) \rangle
= \langle c\,\mat{\Omega}, \ten{X}\circ\ten{X} \rangle
\end{displaymath}
which means that
\begin{displaymath}
c \vec{\mat{\Omega}} = \t{(\mat{T}_2\pinv{\mat{D}_p})}\mat{\eta}_2.
\end{displaymath}
The inverse relation is
\begin{displaymath}
\mat{\eta}_2 = c\t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})})}\vec\mat{\Omega} = c\t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})})}\mat{D}_p\vech\mat{\Omega},
\end{displaymath}
describing the linear relation between $\mat{\eta}_2$ and $\vech{\mat{\Omega}}$. This gives the following relation between $\mat{\eta}_y = (\mat{\eta}_{1y}, \mat{\eta}_2)$ and $\mat{\xi} = (\vec{\overline{\ten{\eta}}}, \vec{\mat{B}}, \vech{\mat{\Omega}})\in\Xi$ as
\begin{equation}
\mat{\eta}_y = \begin{pmatrix}
\mat{I}_p & \t{(\vec{\ten{F}_y})}\otimes\mat{I}_p & 0 \\
0 & 0 & c\t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})})}\mat{D}_p
\end{pmatrix}\begin{pmatrix}
\vec{\overline{\ten{\eta}}} \\
\vec{\mat{B}} \\
\vech{\mat{\Omega}}
\end{pmatrix} =: \mat{F}(y)\mat{\xi} \label{eq:eta-to-xi-linear-relation}
\end{equation}
where $\mat{F}(y)$ is a $(p + d)\times p (p + 2 q + 3) / 2$ dimensional matrix valued function in $y$. Moreover, for every $y$ the matrix $\mat{F}(y)$ is of full rank $p + d$.
The log-likelihood of model \eqref{eq:quad-density} for the unconstrained parameters $\mat{\xi}\in\Xi$ is
\begin{displaymath}
l_n(\mat{\xi})
= \frac{1}{n}\sum_{i = 1}^{n} (\langle \mat{t}(\ten{X}_i), \mat{\eta}_{y_i} \rangle - b(\mat{\eta}_{y_i}))
=: \frac{1}{n}\sum_{i = 1}^{n} m_{\mat{\xi}}(Z_i)
\end{displaymath}
where $Z_i = (\ten{X}_i, Y_i)$. Using \eqref{eq:eta-to-xi-linear-relation} we can write
\begin{displaymath}
m_{\mat{\xi}}(z) = \langle\mat{t}(\ten{X}), \mat{F}(y)\mat{\xi}\rangle - b(\mat{F}(y)\mat{\xi}).
\end{displaymath}
The following are the regularity conditions for the log-likelihood required by \cref{thm:asymptotic-normality-gmlm}.
\begin{condition}\label{cond:differentiable-and-convex}
The mapping $\mat{\xi}\mapsto m_{\mat{\xi}}(z)$ is twice continuously differentiable for almost every $z$ and $z\mapsto m_{\mat{\xi}}(z)$ is measurable. Moreover, $\mat{\eta}\mapsto b(\mat{\eta})$ is strictly convex. \todo{Furthermore, for every $\widetilde{\mat{\eta}}$ holds $P(\mat{F}(Y)\mat{\xi} = \widetilde{\mat{\eta}}) < 1$. Do I need this???}
\end{condition}
\begin{condition}\label{cond:moments}
It holds $\E\|\t{\mat{t}(\ten{X})}\mat{F}(Y)\| < \infty$ and $\E\|\t{\mat{t}(\ten{X})}\mat{F}(Y)\|^2 < \infty$.
\end{condition}
\begin{condition}\label{cond:finite-sup-on-compacta}
The mapping $\mat{\eta}\mapsto b(\mat{\eta})$ is twice continuously differentiable and for every non-empty compact $K\subseteq\Xi$ holds
\begin{gather*}
\E\sup_{\mat{\xi}\in K}\|b(\mat{F}(Y)\mat{\xi})\| < \infty, \qquad
\E\sup_{\mat{\xi}\in K}\|\t{\nabla b(\mat{F}(Y)\mat{\xi})}\mat{F}(Y)\|^2 < \infty, \\
\E\sup_{\mat{\xi}\in K}\| \t{\mat{F}(Y)}\nabla^2 b(\mat{F}(Y)\mat{\xi})\mat{F}(Y) \| < \infty.
\end{gather*}
\end{condition}
The following is a technical lemma used in the proof of \cref{thm:asymptotic-normality-gmlm}.
\begin{lemma}\label{thm:kron-manifold-tangent-space}
Let $\manifold{A}_k\subseteq\mathbb{R}^{p_k\times q_k}\backslash\{\mat{0}\}$ for $k = 1, \ldots, r$ be smooth embedded submanifolds as well as either a sphere or a cone. Then
\begin{displaymath}
\manifold{K} = \Bigl\{ \bigkron_{k = r}^{1}\mat{A}_k : \mat{A}_k\in\manifold{A}_k \Bigr\}
\end{displaymath}
is an embedded manifold in $\mathbb{R}^{p\times q}$ for $p = \prod_{k = 1}^{r} p_k$ and $q = \prod_{k = 1}^{r} q_k$.
Furthermore, define for $j = 1, \ldots, r$ the matrices
\begin{equation}\label{eq:kron-differential-span}
\mat{\Gamma}_j
= \bigkron_{k = r}^{1}(\mat{I}_{p_k q_k}\mathrm{\ if\ } j = k \mathrm{\ else\ }\vec{\mat{A}_k})
= \bigkron_{k = r}^{j + 1}(\vec{\mat{A}_k})\otimes\mat{I}_{p_j q_j}\otimes\bigkron_{k = j - 1}^{1}(\vec{\mat{A}_k})
\end{equation}
and let $\mat{\gamma}_j$ be $p_j q_j\times d_j$ matrices with $d_j \geq\dim\manifold{A}_j$ which span the tangent space $T_{\mat{A}_j}\manifold{A}_j$ of $\manifold{A}_j$ at $\mat{A}_j\in\manifold{A}_j$, that is $\Span\mat{\gamma}_j = T_{\mat{A}_j}\manifold{A}_j$.
Then, with the permutation matrix $\mat{S}_{\mat{p}, \mat{q}}$ defined in \eqref{eq:S_pq}, the $p q \times \sum_{j = 1}^{r} d_j$ dimensional matrix
% Then, with $\mat{p} = (p_1, \ldots, p_r)$, $\mat{q} = (q_1, \ldots, q_r)$ and $\mat{S}_{\mat{p}, \mat{q}}$ being the permutation matrix from \cref{thm:kron-perm}, the $p q \times \sum_{k = 1}^{r} d_j$ dimensional matrix
\begin{displaymath}
\mat{P}_{\mat{A}} = \mat{S}_{\mat{p}, \mat{q}}\left[\mat{\Gamma}_1\mat{\gamma}_1, \mat{\Gamma}_2\mat{\gamma}_2, \ldots, \mat{\Gamma}_r\mat{\gamma}_r\right]
\end{displaymath}
spans the tangent space $T_{\mat{A}}\manifold{K}$ of $\manifold{K}$ at $\mat{A} = \bigkron_{k = r}^{1}\mat{A}_k\in\manifold{K}$, in formula $\Span\mat{P}_{\mat{A}} = T_{\mat{A}}\manifold{K}$.
\end{lemma}
\begin{proof}
% The statement that $\manifold{K}$ is an embedded manifold as well as its dimension follows via induction using \cref{thm:kron-manifolds}.
The statement that $\manifold{K}$ is an embedded manifold follows via induction using \cref{thm:kron-manifolds}.
We compute the differential of the vectorized Kronecker product using \cref{thm:kron-perm} where $\mat{S}_{\mat{p}, \mat{q}}$ is the permutation \eqref{eq:S_pq} defined therein.
\begin{multline*}
\d\vec\bigkron_{k = r}^{1}\mat{A}_k
= \vec\sum_{j = 1}^{r}\bigkron_{k = r}^{1}(\ternary{k = j}{\d\mat{A}_j}{\mat{A}_k}) \\
= \mat{S}_{\mat{p}, \mat{q}}\vec\sum_{j = 1}^{r}\Bigl(\bigouter_{k = 1}^{r}(\ternary{k = j}{\d\mat{A}_j}{\mat{A}_k})\Bigr)
= \mat{S}_{\mat{p}, \mat{q}}\sum_{j = 1}^{r}\bigkron_{k = r}^{1}(\ternary{k = j}{\vec\d\mat{A}_j}{\vec\mat{A}_k}) \\
= \mat{S}_{\mat{p}, \mat{q}}\sum_{j = 1}^{r}\Bigl(\bigkron_{k = r}^{1}(\ternary{k = j}{\mat{I}_{p_j q_j}}{\vec\mat{A}_k})\Bigr)\vec\d\mat{A}_j
= \mat{S}_{\mat{p}, \mat{q}}\sum_{j = 1}^{r}\mat{\Gamma}_j\vec\d\mat{A}_j \\
= \mat{S}_{\mat{p}, \mat{q}}[\mat{\Gamma}_1, \ldots, \mat{\Gamma}_r]\begin{pmatrix}
\vec\d\mat{A}_1 \\ \vdots \\ \vec\d\mat{A}_r
\end{pmatrix}
\end{multline*}
Due to the definition of the manifold this differential provides the gradient of a surjective map into the manifold. The span of the gradient then spans the tangent space.
Now, we take a closer look at the differentials $\vec{\d\mat{A}_j}$ for $j = 1, \ldots, r$. Let $\varphi_j$ be a chart of $\manifold{A}_j$ in a neighborhood of $\mat{A}_j$. Then, $\mat{A}_j = \varphi_j^{-1}(\varphi_j(\mat{A}_j))$ which gives
\begin{displaymath}
\vec{\d\mat{A}_j} = \t{\nabla\varphi_j^{-1}(\varphi_j(\mat{A}_j))}\vec\d\varphi_j(\mat{A}_j).
\end{displaymath}
Therefore, for every matrix $\mat{\gamma}_j$ such that $\Span{\mat{\gamma}_j} = T_{\mat{A}_j}\manifold{A}_j$ holds $\Span{\t{\nabla\varphi_j^{-1}(\varphi_j(\mat{A}_j))}} = \Span{\mat{\gamma}_j}$ by \cref{def:tangent-space} of the tangent space. We get
\begin{displaymath}
\Span\mat{S}_{\mat{p}, \mat{q}}[\mat{\Gamma}_1, \ldots, \mat{\Gamma}_r]\begin{pmatrix}
\vec\d\mat{A}_1 \\ \vdots \\ \vec\d\mat{A}_r
\end{pmatrix}
=
\Span\mat{S}_{\mat{p}, \mat{q}}[\mat{\Gamma}_1\mat{\gamma}_1, \ldots, \mat{\Gamma}_r\mat{\gamma}_r]
=
\Span\mat{P}_{\mat{A}}
\end{displaymath}
which concludes the proof.
\end{proof}
\begin{proof}[Proof of \cref{thm:asymptotic-normality-gmlm}]
The proof consists of three parts. First, we show the existence of a consistent strong M-estimator by applying \cref{thm:M-estimator-consistency-on-subsets}. Next, we apply \cref{thm:M-estimator-asym-normal-on-manifolds} to obtain its asymptotic normality. We conclude by computing the missing parts of the asymptotic covariance matrix $\mat{\Sigma}_{\mat{\theta}_0}$ provided by \cref{thm:M-estimator-asym-normal-on-manifolds}.
We check whether the conditions of \cref{thm:M-estimator-consistency-on-subsets} are satisfied. On $\Xi$, the mapping $\mat{\xi}\mapsto m_{\mat{\xi}}(z) = m_{\mat{\xi}}(\ten{X},y) = \langle \mat{F}(y)\mat{\xi}, \mat{t}(\ten{X}) \rangle - b(\mat{F}(y)\mat{\xi})$ is strictly concave for every $z$ because $\mat{\xi}\mapsto\mat{F}(y)\mat{\xi}$ is linear and $b$ is strictly convex by \cref{cond:differentiable-and-convex}. Since $\ten{X} \mid Y$ is distributed according to \eqref{eq:quadratic-exp-fam}, the function $M(\mat{\xi}) = \E m_{\mat{\xi}}(Z)$ is well defined by \cref{cond:moments}. Let $\mat{\xi}_k = (\vec{\overline{\ten{\eta}}_k}, \vec{\mat{B}_k}, \vech{\mat{\Omega}_k})$, and $f_{\mat{\xi}_k}$ be the pdf of $\ten{X} \mid Y$ indexed by $\mat{\xi}_k$, for $k = 1, 2$. If $\mat{\xi}_1\ne \mat{\xi}_2$, then $f_{\mat{\xi}_1} \neq f_{\mat{\xi}_2}$, which implies that the true $\mat{\theta}_0\in\Theta\subseteq\Xi$ is the unique maximizer of $M$ by applying \textcite[Lemma~5.35]{asymStats-van_der_Vaart1998}. Finally, under \cref{cond:finite-sup-on-compacta}, all assumptions of \cref{thm:M-estimator-consistency-on-subsets} are fulfilled yielding the existence of a consistent strong M-estimator over $\Theta\subseteq\Xi$.
Next, let $\hat{\mat{\theta}}_n$ be a strong M-estimator on $\Theta\subseteq\Xi$, whose existence and consistency were shown in the previous step. Since $z\mapsto m_{\mat{\xi}}(z)$ is measurable for all $\mat{\xi}\in\Xi$, it is also measurable in a neighborhood of $\mat{\theta}_0$. The differentiability of $\mat{\theta}\mapsto m_{\mat{\theta}}(z)$ is stated in \cref{cond:differentiable-and-convex}. For the Lipschitz condition, let $K\subseteq\Xi$ be a compact neighborhood of $\mat{\theta}_0$, which exists since $\Xi$ is open. Then,
\begin{align*}
\left| m_{\mat{\theta}_1}(z) - m_{\mat{\theta}_2}(z) \right|
&= \left| \langle \mat{t}(\ten{X}), \mat{F}(y)(\mat{\theta}_1 - \mat{\theta}_2) \rangle - b(\mat{F}(y)\mat{\theta}_1) + b(\mat{F}(y)\mat{\theta}_2) \right| \\
&\leq (\| \t{\mat{F}(y)}\mat{t}(\ten{X}) \|_2 + \sup_{\mat{\theta}\in K}\| \t{\nabla b(\mat{F}(y)\mat{\theta})} \mat{F}(y)\| ) \| \mat{\theta}_1 - \mat{\theta}_2 \|_2
=: u(z)\| \mat{\theta}_1 - \mat{\theta}_2 \|_2
\end{align*}
where $u(z)$ is measurable and square integrable, which follows from \cref{cond:moments} and \cref{cond:finite-sup-on-compacta}. The existence of a second-order Taylor expansion of $\mat{\theta}\mapsto M(\mat{\theta}) = \E m_{\mat{\theta}}(Z)$ in a neighborhood of $\mat{\theta}_0$ holds by \cref{cond:finite-sup-on-compacta}. Moreover, the Hessian $\mat{H}_{\mat{\theta}_0}$ is non-singular by the strict convexity of $b$ stated in \cref{cond:differentiable-and-convex}. Now, we can apply \cref{thm:M-estimator-asym-normal-on-manifolds} to obtain the asymptotic normality of $\sqrt{n}(\hat{\mat{\theta}}_n - \mat{\theta}_0)$ with variance-covariance structure
\begin{equation}\label{eq:asymptotic-covariance-gmlm}
\mat{\Sigma}_{\mat{\theta}_0} = \mat{\Pi}_{\mat{\theta}_0} \E[\nabla m_{\mat{\theta}_0}(Z)\t{(\nabla m_{\mat{\theta}_0}(Z))}]\mat{\Pi}_{\mat{\theta}_0}
\end{equation}
where $\mat{\Pi}_{\mat{\theta}_0} = \mat{P}_{\mat{\theta}_0}(\t{\mat{P}_{\mat{\theta}_0}}\mat{H}_{\mat{\theta}_0}\mat{P}_{\mat{\theta}_0})^{-1}\t{\mat{P}_{\mat{\theta}_0}}$ and $\mat{P}_{\mat{\theta}_0}$ is any $p\times \dim(\Theta)$ matrix such that it spans the tangent space of $\Theta$ at $\mat{\theta}_0$. That is, $\Span \mat{P}_{\mat{\theta}_0} = T_{\mat{\theta}_0}\Theta$.
Finally, we compute a matrix $\mat{P}_{\mat{\theta}_0}$ such that $\Span{\mat{P}_{\mat{\theta}_0}} = T_{\mat{\theta}_0}\Theta$ for $\Theta = \mathbb{R}^p\times\manifold{K}_{\mat{B}}\times\manifold{CK}_{\mat{\Omega}}$ as in \cref{thm:param-manifold}. Since the manifold $\Theta$ is a product manifold we get a block diagonal structure for $\mat{P}_{\mat{\theta}_0}$ as
\begin{displaymath}
\mat{P}_{\mat{\theta}_0} = \begin{pmatrix}
\mat{I}_p & 0 & 0 \\
0 & \mat{P}_{\mat{B}_0} & 0 \\
0 & 0 & \mat{P}_{\mat{\Omega}_0}
\end{pmatrix}
\end{displaymath}
where $\mat{I}_p$ is the identity matrix spanning the tangent space of $\mathbb{R}^p$, which is identified with $\mathbb{R}^p$ itself. The blocks $\mat{P}_{\mat{B}_0}$ and $\mat{P}_{\mat{\Omega}_0}$ need to span the tangent spaces of $\manifold{K}_{\mat{B}}$ and $\manifold{CK}_{\mat{\Omega}}$, respectively. Both $\manifold{K}_{\mat{B}}$ and $\manifold{CK}_{\mat{\Omega}}$ are manifolds according to \cref{thm:kron-manifolds} under the cone condition. The constraint manifold $\manifold{CK}_{\mat{\Omega}}$ is the intersection of $\manifold{K}_{\mat{\Omega}}$ with the span of the projection $\pinv{(\mat{T}_2\pinv{\mat{D}_p})}\mat{T}_2\pinv{\mat{D}_p}$ meaning that the differential $\vec{\d\mat{\Omega}}$ on $\manifold{CK}_{\mat{\Omega}}$ fulfills $\vec{\d\mat{\Omega}} = \pinv{(\mat{T}_2\pinv{\mat{D}_p})}\mat{T}_2\pinv{\mat{D}_p}\vec{\d\mat{\Omega}}$. Now, we can apply \cref{thm:kron-manifold-tangent-space} for $\manifold{K}_{\mat{B}}$ and $\manifold{K}_{\mat{\Omega}}$ which gives
\begin{align*}
\mat{P}_{\mat{B}_0} &= \mat{S}_{\mat{p}, \mat{q}}[\mat{\Gamma}_{\mat{\beta}_1}\mat{\gamma}_{\mat{\beta}_1}, \ldots, \mat{\Gamma}_{\mat{\beta}_r}\mat{\gamma}_{\mat{\beta}_r}], \\
\mat{P}_{\mat{\Omega}_0} &= \pinv{(\mat{T}_2\pinv{\mat{D}_p})}\mat{T}_2\pinv{\mat{D}_p}\mat{S}_{\mat{p}, \mat{p}}[\mat{\Gamma}_{\mat{\Omega}_1}\mat{\gamma}_{\mat{\Omega}_1}, \ldots, \mat{\Gamma}_{\mat{\Omega}_r}\mat{\gamma}_{\mat{\Omega}_r}]
\end{align*}
where the matrices $\mat{S}_{\mat{p}, \mat{q}}$, $\mat{\Gamma}_{\mat{\beta}_j}$, $\mat{\gamma}_{\mat{\beta}_j}$, $\mat{\Gamma}_{\mat{\Omega}_j}$ and $\mat{\gamma}_{\mat{\Omega}_j}$ are described in \cref{thm:kron-manifold-tangent-space} for the Kronecker manifolds $\manifold{K}_{\mat{B}}$ and $\manifold{K}_{\mat{\Omega}}$. This leads to
\begin{equation}\label{eq:param-manifold-span}
\mat{P}_{\mat{\theta}_0} = \begin{pmatrix}
\mat{I}_p & 0 & 0 \\
0 & \mat{S}_{\mat{p}, \mat{q}}[\mat{\Gamma}_{\mat{\beta}_1}\mat{\gamma}_{\mat{\beta}_1}, \ldots, \mat{\Gamma}_{\mat{\beta}_r}\mat{\gamma}_{\mat{\beta}_r}] & 0 \\
0 & 0 & \pinv{(\mat{T}_2\pinv{\mat{D}_p})}\mat{T}_2\pinv{\mat{D}_p}\mat{S}_{\mat{p}, \mat{p}}[\mat{\Gamma}_{\mat{\Omega}_1}\mat{\gamma}_{\mat{\Omega}_1}, \ldots, \mat{\Gamma}_{\mat{\Omega}_r}\mat{\gamma}_{\mat{\Omega}_r}]
\end{pmatrix}.
\end{equation}
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\printbibliography[heading=bibintoc, title={References}]
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{document}