\documentclass[aos]{imsart} %% Packages \usepackage[utf8]{inputenc} \usepackage[LSF, T1]{fontenc} % \usepackage{lmodern} % TODO: interfers with "imsart" classed author address display \usepackage{amsthm, amsmath, amsfonts, amssymb, bm, pifont} \usepackage{float} \usepackage{chessfss} \usepackage{scalerel} \usepackage[dvipsnames]{xcolor} \usepackage{graphicx} \usepackage[authoryear]{natbib} % External References to main file \usepackage{xr} \externaldocument{main} \usepackage[colorlinks, citecolor = blue, urlcolor = blue]{hyperref} \usepackage[noabbrev, capitalize, nameinlink]{cleveref} % after hyperref \startlocaldefs %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% %% Uncomment next line to change %% %% the type of equation numbering %% %% %% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %\numberwithin{equation}{section} %% %% %% For Axiom, Claim, Corollary, Hypothesis, %% %% Lemma, Theorem, Proposition %% %% use \theoremstyle{plain} %% %% %% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \theoremstyle{plain} \newtheorem{theorem}{Theorem} \newtheorem{lemma}{Lemma} \newtheorem{corollary}{Corollary} \newtheorem{proposition}{Proposition} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% %% For Assumption, Definition, Example, %% %% Notation, Property, Remark, Fact %% %% use \theoremstyle{remark} %% %% %% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \theoremstyle{remark} \newtheorem{definition}{Definition} \newtheorem{condition}{Condition} \newtheorem{example}{Example} \newtheorem{remark}{Remark} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% Please put your definitions here: %% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% Clever ref additional reference name \crefname{condition}{Condition}{Conditions} \Crefname{condition}{Condition}{Conditions} \crefrangelabelformat{condition}{#3#1#4-#5#2#6} % matrices \newcommand*{\mat}[1]{\boldsymbol{#1}} % tensors (special case for lower case caligraphic letters) \newcommand*{\ten}[1]{ \ifnum\pdfstrcmp{#1}{`}=1 % lowercase argument \mathfrak{#1} \else % uppercase argument \mathcal{#1} \fi } \newcommand{\manifold}[1]{\mathfrak{#1}} % Define math macros \renewcommand{\hat}{\widehat} \renewcommand*{\vec}{\operatorname{vec}} \newcommand*{\vech}{\operatorname{vech}} \newcommand*{\rank}{\operatorname{rank}} \newcommand*{\diag}{\operatorname{diag}} \DeclareMathOperator{\tr}{tr} \DeclareMathOperator{\var}{Var} \DeclareMathOperator{\cov}{Cov} \DeclareMathOperator{\Span}{span} \DeclareMathOperator{\E}{\operatorname{\mathbb{E}}} \DeclareMathOperator*{\argmax}{{arg\,max}} \newcommand*{\D}{\textnormal{D}} % derivative \renewcommand*{\H}{\textnormal{H}} % hessian \renewcommand*{\d}{\textnormal{d}} % differential \renewcommand*{\t}[1]{{#1^{T}}} % matrix transpose \newcommand*{\pinv}[1]{{#1^{\dagger}}} % `Moore-Penrose pseudoinverse` % rearangment operator, generalization of Van-Loan and Pitzianis rearrangement opreration \newcommand*{\K}{\mathcal{K}} % ternary operator (C style argments: ? 
: ) \newcommand{\ternary}[3]{{#2}{\ \mathrm{if}\ }{#1}{\ \mathrm{else}\ }{#3}} \renewcommand{\checkmark}{{\color{Green}\ding{51}}} \newcommand{\xmark}{{\color{Red!70}\ding{55}}} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \newcommand{\efi}[1]{{\color{teal}Effie: #1}} \newcommand{\daniel}[1]{{\color{red!70!black}Daniel: #1}} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Special Matrix Sets (Manifolds) \newcommand{\StiefelNonCompact}[2]{\mathbb{R}_{*}^{{#1}\times {#2}}} \newcommand{\Stiefel}[2]{\mathrm{St}^{{#1}\times {#2}}} \newcommand{\SymMat}[1]{\mathrm{Sym}^{{#1}\times {#1}}} \newcommand{\SymPosDefMat}[1]{\mathrm{Sym}_{++}^{{#1}\times {#1}}} \newcommand{\OrthogonalGrp}[1]{\mathrm{O}(#1)} \newcommand{\SpecialOrthogonalGrp}[1]{\mathrm{SO}(#1)} %%% Custom operators with ether one or two arguments (limits) \makeatletter %%% Multi-Linear Multiplication % $\mlm_{k \in [r]}$ or $\mlm_{k = 1}^{r}$ (lower limit MUST be the first!) % Save first argument as \arg@one \def\mlm_#1{\def\arg@one{#1}\futurelet\next\mlm@i} % Check for second argument \def\mlm@i{\ifx\next^\expandafter\mlm@two\else\expandafter\mlm@one\fi} % specialization for one or two arguments, both versions use saved first argument \def\mlm@one{\mathchoice% {\operatorname*{\scalerel*[1.65em]{\times}{\bigotimes}}_{\makebox[0pt][c]{$\scriptstyle \arg@one$}}}% {\operatorname*{\scalerel*[1.65em]{\times}{\bigotimes}}_{\arg@one}}% {\operatorname*{\scalerel*[1.65em]{\times}{\bigotimes}}_{\arg@one}}% {\operatorname*{\scalerel*[1.65em]{\times}{\bigotimes}}_{\arg@one}}% } % this commands single argument is the second argument of \mlm, it gobbles the `^` \def\mlm@two^#1{\mathchoice% {\operatorname*{\scalerel*[1.65em]{\times}{\bigotimes}}_{\makebox[0pt][c]{$\scriptstyle \arg@one$}}^{\makebox[0pt][c]{$\scriptstyle #1$}}}% {\operatorname*{\scalerel*[1.65em]{\times}{\bigotimes}}_{\arg@one}^{#1}}% {\operatorname*{\scalerel*[1.65em]{\times}{\bigotimes}}_{\arg@one}^{#1}}% {\operatorname*{\scalerel*[1.65em]{\times}{\bigotimes}}_{\arg@one}^{#1}}% } %%% Big Circle (Iterated Outer Product) \def\bigouter_#1{\def\arg@one{#1}\futurelet\next\bigouter@i} \def\bigouter@i{\ifx\next^\expandafter\bigouter@two\else\expandafter\bigouter@one\fi} \def\bigouter@one{\mathchoice% {\operatorname*{\scalerel*{\circ}{\bigotimes}}_{\makebox[0pt][c]{$\scriptstyle \arg@one$}}}% {\operatorname*{\scalerel*{\circ}{\bigotimes}}_{\arg@one}}% {\operatorname*{\scalerel*{\circ}{\bigotimes}}_{\arg@one}}% {\operatorname*{\scalerel*{\circ}{\bigotimes}}_{\arg@one}}% } \def\bigouter@two^#1{\mathchoice% {\operatorname*{\scalerel*{\circ}{\bigotimes}}_{\makebox[0pt][c]{$\scriptstyle \arg@one$}}^{\makebox[0pt][c]{$\scriptstyle #1$}}}% {\operatorname*{\scalerel*{\circ}{\bigotimes}}_{\arg@one}^{#1}}% {\operatorname*{\scalerel*{\circ}{\bigotimes}}_{\arg@one}^{#1}}% {\operatorname*{\scalerel*{\circ}{\bigotimes}}_{\arg@one}^{#1}}% } %%% Big Kronecker Product (with overflowing limits) % Save first argument as \arg@one \def\bigkron_#1{\def\arg@one{#1}\futurelet\next\bigkron@i} % Check for second argument \def\bigkron@i{\ifx\next^\expandafter\bigkron@two\else\expandafter\bigkron@one\fi} % specialization for one or two arguments, both versions use saved first argument \def\bigkron@one{\mathchoice% {\bigotimes_{\makebox[0pt][c]{$\scriptstyle \arg@one$}}}% {\bigotimes_{\arg@one}}% {\bigotimes_{\arg@one}}% {\bigotimes_{\arg@one}}% } % this commands single argument is the second argument of \bigkron \def\bigkron@two^#1{\mathchoice% {\bigotimes_{\makebox[0pt][c]{$\scriptstyle \arg@one$}}^{\makebox[0pt][c]{$\scriptstyle #1$}}}% 
{\bigotimes_{\arg@one}^{#1}}% {\bigotimes_{\arg@one}^{#1}}% {\bigotimes_{\arg@one}^{#1}}% } \makeatother %%% "Fix" additional spacing around \left(...\right), % see: https://tex.stackexchange.com/questions/2607/spacing-around-left-and-right \let\originalleft\left \let\originalright\right \renewcommand{\left}{\mathopen{}\mathclose\bgroup\originalleft} \renewcommand{\right}{\aftergroup\egroup\originalright} %%% Sets the max nr. of columns in AMSmath's matrix environments to 20 \setcounter{MaxMatrixCols}{20} \endlocaldefs \begin{document} \begin{frontmatter} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% %% Enter the title of your article here %% %% %% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \title{Supplement to ``Generalized Multilinear Models for Sufficient Dimension Reduction on Tensor-valued Predictors''} \runtitle{GMLM for SDR on tensor-valued Predictors} \begin{aug} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% Only one address is permitted per author. %% %% Only division, organization and e-mail is %% %% included in the address. %% %% Additional information can be included in %% %% the Acknowledgments section if necessary. %% %% ORCID can be inserted by command: %% %% \orcid{0000-0000-0000-0000} %% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \author[A]{\fnms{Daniel}~\snm{Kapla}\ead[label=es1]{daniel.kapla@tuwien.ac.at}} \and \author[A]{\fnms{Efstathia}~\snm{Bura}\ead[label=es2]{efstathia.bura@tuwien.ac.at}} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% Addresses %% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \address[A]{Institute of Statistics and Mathematical Methods in Economics, Faculty of Mathematics and Geoinformation, TU Wien \\ \printead[presep={\ }]{es1,es2}} \end{aug} \end{frontmatter} %%% Set counters to continue from main paper counters (after frontmatter!) \setcounter{theorem}{5} \setcounter{definition}{2} \setcounter{figure}{3} \setcounter{equation}{19} \setcounter{footnote}{10} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% Multiple Appendixes: %% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{appendix} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Examples}\label{app:examples} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{example}[Vectorization]\label{ex:vectorization} Given a matrix \begin{displaymath} \mat{A} = \begin{pmatrix} 1 & 4 & 7 \\ 2 & 5 & 8 \\ 3 & 6 & 9 \end{pmatrix} \end{displaymath} its vectorization is $\vec{\mat{A}} = \t{(1, 2, 3, 4, 5, 6, 7, 8, 9)}$ and its half vectorization $\vech{\mat{A}} = \t{(1, 2, 3, 5, 6, 9)}$. Let $\ten{A}$ be a tensor of dimension $3\times 3\times 3$ given by \begin{displaymath} \ten{A}_{:,:,1} = \begin{pmatrix} 1 & 4 & 7 \\ 2 & 5 & 8 \\ 3 & 6 & 9 \end{pmatrix}, \qquad \ten{A}_{:,:,2} = \begin{pmatrix} 10 & 13 & 16 \\ 11 & 14 & 17 \\ 12 & 15 & 18 \end{pmatrix}, \qquad \ten{A}_{:,:,3} = \begin{pmatrix} 19 & 22 & 25 \\ 20 & 23 & 26 \\ 21 & 24 & 27 \end{pmatrix}. \end{displaymath} Then the vectorization of $\ten{A}$ is given by \begin{displaymath} \vec{\ten{A}} = \t{(1, 2, 3, 4, ..., 26, 27)}\in\mathbb{R}^{27}. \end{displaymath} \end{example} \begin{example}[Matricization] Let $\ten{A}$ be the $3\times 4\times 2$ tensor given by \begin{displaymath} \ten{A}_{:,:,1} = \begin{pmatrix} 1 & 4 & 7 & 10 \\ 2 & 5 & 8 & 11 \\ 3 & 6 & 9 & 12 \end{pmatrix}, \ten{A}_{:,:,2} = \begin{pmatrix} 13 & 16 & 19 & 22 \\ 14 & 17 & 20 & 23 \\ 15 & 18 & 21 & 24 \end{pmatrix}. 
\end{displaymath}
Its matricizations are then
\begin{gather*}
\ten{A}_{(1)} = \begin{pmatrix}
    1 & 4 & 7 & 10 & 13 & 16 & 19 & 22 \\
    2 & 5 & 8 & 11 & 14 & 17 & 20 & 23 \\
    3 & 6 & 9 & 12 & 15 & 18 & 21 & 24
\end{pmatrix}, \qquad
\ten{A}_{(2)} = \begin{pmatrix}
    1 & 2 & 3 & 13 & 14 & 15 \\
    4 & 5 & 6 & 16 & 17 & 18 \\
    7 & 8 & 9 & 19 & 20 & 21 \\
    10 & 11 & 12 & 22 & 23 & 24
\end{pmatrix}, \\
\ten{A}_{(3)} = \begin{pmatrix}
    1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10 & 11 & 12 \\
    13 & 14 & 15 & 16 & 17 & 18 & 19 & 20 & 21 & 22 & 23 & 24
\end{pmatrix}.
\end{gather*}
\end{example}
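The following additional example is included purely as an illustration of the mode product $\times_k$ and the multilinear multiplication $\mlm$ used throughout the proofs; the matrix $\mat{B}$ below is chosen arbitrarily.
\begin{example}[Multilinear multiplication]
Let $\ten{A}$ be the $3\times 4\times 2$ tensor of the previous example and let
\begin{displaymath}
\mat{B} = \begin{pmatrix} 1 & 1 & 1 \\ 0 & 1 & -1 \end{pmatrix}\in\mathbb{R}^{2\times 3}.
\end{displaymath}
The mode-$1$ product $\ten{A}\times_1\mat{B}$ is the $2\times 4\times 2$ tensor determined by $(\ten{A}\times_1\mat{B})_{(1)} = \mat{B}\ten{A}_{(1)}$, which gives
\begin{displaymath}
(\ten{A}\times_1\mat{B})_{:,:,1} = \begin{pmatrix}
    6 & 15 & 24 & 33 \\
    -1 & -1 & -1 & -1
\end{pmatrix}, \qquad
(\ten{A}\times_1\mat{B})_{:,:,2} = \begin{pmatrix}
    42 & 51 & 60 & 69 \\
    -1 & -1 & -1 & -1
\end{pmatrix}.
\end{displaymath}
The multilinear multiplication $\ten{A}\mlm_{k = 1}^{3}\mat{B}_k$ applies such a mode-$k$ product in every mode, one matrix $\mat{B}_k$ per mode.
\end{example}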
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Proofs}\label{app:proofs}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{proof}[\hypertarget{proof:sdr}{Proof of \cref{thm:sdr}}]
A direct implication of Theorem~1 from \cite{BuraDuarteForzani2016} is that, under the exponential family \eqref{eq:quad-density} with natural statistic $\mat{t}(\ten{X})$,
\begin{displaymath}
\t{\mat{\alpha}}(\mat{t}(\ten{X}) - \E\mat{t}(\ten{X}))
\end{displaymath}
is a sufficient reduction, where $\mat{\alpha}\in\mathbb{R}^{p (p + 3) / 2\times q}$ with $\Span(\mat{\alpha}) = \Span(\{\mat{\eta}_y - \E_{Y}\mat{\eta}_Y : y\in\mathcal{S}_Y\})$, where $\mathcal{S}_Y$ denotes the sample space of $Y$. Since $\E_Y\ten{F}_Y = 0$ and $\E_Y\mat{\eta}_{1 Y} = \E[\vec\overline{\ten{\eta}} + \mat{B}\vec\ten{F}_Y] = \vec\overline{\ten{\eta}}$, it follows that
\begin{displaymath}
\mat{\eta}_y - \E_{Y}\mat{\eta}_Y = \begin{pmatrix}
    \mat{\eta}_{1 y} - \E_{Y}\mat{\eta}_{1 Y} \\
    \mat{\eta}_{2} - \E_{Y}\mat{\eta}_{2}
\end{pmatrix} = \begin{pmatrix}
    \mat{B}\vec\ten{F}_y \\
    \mat{0}
\end{pmatrix}
\end{displaymath}
because $\mat{\eta}_{2}$ does not depend on $y$. The set $\{ \vec{\ten{F}_y} : y\in\mathcal{S}_Y \}$ is a subset of $\mathbb{R}^q$. Therefore,
\begin{displaymath}
\Span\left(\{\mat{\eta}_y - \E_{Y}\mat{\eta}_Y : y\in\mathcal{S}_Y\}\right) = \Span\left(\left\{\begin{pmatrix}
    \mat{B}\vec\ten{F}_y \\ \mat{0}
\end{pmatrix} : y\in\mathcal{S}_Y \right\}\right) \subseteq \Span\begin{pmatrix}
    \mat{B} \\ \mat{0}
\end{pmatrix}.
\end{displaymath}
Hence, with $\mat{t}(\ten{X}) = (\vec\ten{X}, \vech((\vec\ten{X})\t{(\vec\ten{X})}))$ and $\mat{B} = \bigkron_{k = r}^{1}\mat{\beta}_k$,
\begin{displaymath}
\t{\begin{pmatrix}
    \mat{B} \\ \mat{0}
\end{pmatrix}}(\mat{t}(\ten{X}) - \E\mat{t}(\ten{X})) = \t{\mat{B}}\vec(\ten{X} - \E\ten{X}) = \vec\Bigl((\ten{X} - \E\ten{X})\mlm_{k = 1}^{r}\t{\mat{\beta}_k}\Bigr)
\end{displaymath}
is also a sufficient reduction, though not necessarily minimal. When the exponential family is full rank, which in our setting amounts to all the $\mat{\beta}_j$, $j = 1, \ldots, r$, being full-rank matrices, Theorem~1 from \cite{BuraDuarteForzani2016} also yields the minimality of the reduction.
\end{proof}
\begin{proof}[\hypertarget{proof:kron-manifolds}{Proof of \cref{thm:kron-manifolds}}]
We start by considering the first case and assume that $\manifold{B}$ is spherical with radius $1$ w.l.o.g. We equip $\manifold{K} = \{ \mat{A}\otimes \mat{B} : \mat{A}\in\manifold{A}, \mat{B}\in\manifold{B} \}\subset\mathbb{R}^{p_1 p_2\times q_1 q_2}$ with the subspace topology.\footnote{Given a topological space $(\manifold{M}, \mathcal{T})$ and a subset $\manifold{S}\subseteq\manifold{M}$, the \emph{subspace topology} on $\manifold{S}$ induced by $\manifold{M}$ consists of all open sets in $\manifold{M}$ intersected with $\manifold{S}$, that is $\{ O\cap\manifold{S} : O\in\mathcal{T} \}$ \cite[]{Lee2012,Lee2018,Kaltenbaeck2021}.} Define the hemispheres $H_i^{+} = \{ \mat{B}\in\manifold{B} : (\vec{\mat{B}})_i > 0 \}$ and $H_i^{-} = \{ \mat{B}\in\manifold{B} : (\vec{\mat{B}})_i < 0 \}$ for $i = 1, \ldots, p_2 q_2$. The hemispheres are an open cover of $\manifold{B}$ with respect to the subspace topology.
Define for every $H_i^{\pm}$, where $\pm$ is a placeholder for either $+$ or $-$, the function
\begin{displaymath}
f_{H_i^{\pm}} : \manifold{A}\times H_i^{\pm}\to\mathbb{R}^{p_1 p_2\times q_1 q_2} : (\mat{A}, \mat{B})\mapsto \mat{A}\otimes \mat{B}
\end{displaymath}
which is smooth. By the spherical property of $\manifold{B}$, the relation $\|\mat{A}\otimes \mat{B}\|_F = \|\mat{A}\|_F$ for all $\mat{A}\otimes \mat{B}\in\manifold{K}$ ensures that the function $f_{H_i^{\pm}}$, restricted to its image, is bijective with inverse function (identifying $\mathbb{R}^{p\times q}$ with $\mathbb{R}^{p q}$) given by
\begin{displaymath}
f_{H_i^{\pm}}^{-1} = \begin{cases}
    f_{H_i^{\pm}}(\manifold{A}\times H_i^{\pm})\to\manifold{A}\times H_i^{\pm} \\
    \mat{C}\mapsto \left(\pm\frac{\|\mat{C}\|_F}{\|\mat{R}(\mat{C})\mat{e}_i\|_2}\mat{R}(\mat{C})\mat{e}_i, \pm\frac{1}{\|\mat{C}\|_F\|\mat{R}(\mat{C})\mat{e}_i\|_2}\t{\mat{R}(\mat{C})}\mat{R}(\mat{C})\mat{e}_i\right)
\end{cases}
\end{displaymath}
where $\pm$ is $+$ for a ``positive'' hemisphere $H_i^{+}$ and $-$ otherwise, $\mat{e}_i\in\mathbb{R}^{p_2 q_2}$ is the $i$th unit vector, and $\mat{R}(\mat{C})$ is a ``reshaping'' permutation\footnote{In relation to $\K$, the operation $\mat{R}$ is essentially its inverse, since $\K(\mat{A}\circ\mat{B}) = \mat{A}\otimes\mat{B}$, up to a mismatch in the shapes only.} which acts on Kronecker products as $\mat{R}(\mat{A}\otimes \mat{B}) = (\vec{\mat{A}})\t{(\vec{\mat{B}})}$. This makes $f_{H_i^{\pm}}^{-1}$ a composition of smooth functions ($\mat{0}$ is excluded from $\manifold{A}$ and $\manifold{B}$, guarding against division by zero), and as such it is also smooth. This ensures that $f_{H_i^{\pm}} : \manifold{A}\times {H_i^{\pm}}\to f_{H_i^{\pm}}(\manifold{A}\times {H_i^{\pm}})$ is a diffeomorphism.
Next, we construct an atlas\footnote{A collection of charts $\{ \varphi_i : i\in I \}$ with index set $I$ of a manifold $\manifold{A}$ is called an \emph{atlas} if the pre-images of the charts $\varphi_i$ cover the entire manifold $\manifold{A}$.} for $\manifold{K}$, which is equipped with the subspace topology. Let $(\varphi_j, U_j)_{j\in J}$ be an atlas of $\manifold{A}\times\manifold{B}$. Such an atlas exists, and the product $\manifold{A}\times\manifold{B}$ admits a unique smooth structure, since both $\manifold{A}$ and $\manifold{B}$ are embedded manifolds of which we take the product manifold. The images of the coordinate domains, $f_{H_i^{\pm}}(U_j)$, are open in $\manifold{K}$, since $f_{H_i^{\pm}}$ is a diffeomorphism, with the corresponding coordinate maps
\begin{displaymath}
\phi_{H_i^{\pm},j} : f_{H_i^{\pm}}(U_j)\to \varphi_j(U_j) : \mat{C}\mapsto \varphi_j(f_{H_i^{\pm}}^{-1}(\mat{C})).
\end{displaymath}
By construction, the set $\{ \phi_{H_i^{\pm},j} : i = 1, \ldots, p_2 q_2, \pm\in\{+, -\}, j\in J \}$ is an atlas, provided the charts are compatible. This means we need to check that the transition maps are diffeomorphisms. Let $(\phi_{H, j}, V_j), (\phi_{\widetilde{H}, k}, V_k)$ be two arbitrary charts from our atlas. Then the transition map $\phi_{\widetilde{H}, k}\circ\phi_{H,j}^{-1}:\phi_{H,j}(V_j\cap V_k)\to\phi_{\widetilde{H},k}(V_j\cap V_k)$ has the form
\begin{displaymath}
\phi_{\widetilde{H}, k}\circ\phi_{H,j}^{-1} = \varphi_k\circ f_{\widetilde{H}}^{-1}\circ f_{H}\circ\varphi_{j}^{-1} = \varphi_k\circ (\pm\mathrm{id})\circ\varphi_{j}^{-1}
\end{displaymath}
where the sign $\pm$ depends on whether $H$ and $\widetilde{H}$ are of the same ``sign'' and $\mathrm{id}$ is the identity. We conclude that the charts are compatible, which makes the constructed set of charts an atlas.
With that we have shown that the topological manifold $\manifold{K}$, equipped with the subspace topology, admits a smooth atlas that makes it an embedded smooth manifold with dimension equal to the dimension of the product manifold $\manifold{A}\times\manifold{B}$; that is, $d = \dim\manifold{A} + \dim\manifold{B}$.
It remains to show that the cone condition also yields a smooth manifold. If both $\manifold{A}$ and $\manifold{B}$ are cones, then $\manifold{K} = \{ \mat{A}\otimes \mat{B} : \mat{A}\in\manifold{A}, \mat{B}\in\widetilde{\manifold{B}} \}$, where $\widetilde{\manifold{B}} = \{ \mat{B}\in\manifold{B} : \|\mat{B}\|_F = 1 \}$. Since $g:\manifold{B}\to\mathbb{R}:\mat{B}\mapsto \|\mat{B}\|_F$ is smooth on $\manifold{B}$ with differential of full rank $1$ everywhere (recall that $\mat{0}$ is excluded from $\manifold{B}$), $\widetilde{\manifold{B}} = g^{-1}(1)$ is a $(\dim{\manifold{B}} - 1)$-dimensional embedded submanifold of $\manifold{B}$. An application of the spherical case proves the cone case.
\end{proof}
\begin{proof}[\hypertarget{proof:param-manifold}{Proof of \cref{thm:param-manifold}}]
Induction using \cref{thm:kron-manifolds} ensures that $\manifold{K}_{\mat{B}}$ and $\manifold{K}_{\mat{\Omega}}$ are embedded submanifolds. Since $\vech : \mathbb{R}^{p^2}\to\mathbb{R}^{p(p+1)/2}$ is linear, and injective on the vectorizations of the symmetric matrices in $\manifold{K}_{\mat{\Omega}}$, and $\manifold{K}_{\mat{\Omega}}$ is embedded, its image $\vech(\manifold{K}_{\mat{\Omega}}) = \{\vech\mat{\Omega}:\mat{\Omega}\in\manifold{K}_{\mat{\Omega}}\}$ is embedded in $\mathbb{R}^{p(p+1)/2}$. Finally, the finite product manifold of embedded submanifolds is embedded in the finite product space of their ambient spaces, that is, $\Theta = \mathbb{R}^p \times \manifold{K}_{\mat{B}}\times\vech(\manifold{K}_{\mat{\Omega}}) \subset \mathbb{R}^p\times\mathbb{R}^{p\times q}\times\mathbb{R}^{p(p + 1)/2}$ is embedded.
\end{proof}
\begin{proof}[\hypertarget{proof:grad}{Proof of \cref{thm:grad}}]
We first note that, for any exponential family with density \eqref{eq:quad-density}, the term $b(\mat{\eta}_{y})$ differentiated with respect to the natural parameter $\mat{\eta}_{y}$ is the expectation of the statistic $\mat{t}(\ten{X})$ given $Y = y$. In our case, we get $\nabla_{\mat{\eta}_{y}}b = (\nabla_{\mat{\eta}_{1{y}}}b, \nabla_{\mat{\eta}_2}b)$ with components
\begin{displaymath}
\nabla_{\mat{\eta}_{1{y}}}b = \E[\vec(\ten{X})\mid Y = y] = \vec\E[\ten{X}\mid Y = y]
\end{displaymath}
and
\begin{multline*}
\nabla_{\mat{\eta}_{2}}b
= \E[\vech((\vec\ten{X})\t{(\vec\ten{X})})\mid Y = y] \\
= \E[\pinv{\mat{D}_p}\vec(\ten{X}\circ\ten{X})\mid Y = y]
= \pinv{\mat{D}_p}\vec\E[\ten{X}\circ\ten{X}\mid Y = y].
\end{multline*}
The gradients are related to their derivatives by transposition, $\nabla_{\mat{\eta}_{1{y}}}b = \t{\D b(\mat{\eta}_{1{y}})}$ and $\nabla_{\mat{\eta}_2}b = \t{\D b(\mat{\eta}_2)}$.
Next, we provide the differentials of the natural parameter components from \eqref{eq:eta1} and \eqref{eq:eta2} in a rather direct form, without any further ``simplifications,'' because the downstream computations will not benefit from re-expressing them:
\begin{align*}
\d\mat{\eta}_{1{y}}(\overline{\ten{\eta}}) &= \d\vec{\overline{\ten{\eta}}}, \\
\d\mat{\eta}_{1{y}}(\mat{\beta}_j) &= \vec\Bigl( \ten{F}_{y}\mlm_{\substack{k = 1\\k\neq j}}^{r}\mat{\beta}_k\times_j\d\mat{\beta}_j \Bigr), \\
\d\mat{\eta}_2(\mat{\Omega}_j) = c\t{\mat{D}_p}\vec\d\bigkron_{k = r}^{1}\mat{\Omega}_k &= c\t{\mat{D}_p}\vec\Bigl(\,\bigkron_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigkron_{l = j - 1}^{1}\mat{\Omega}_l \Bigr).
\end{align*}
All other combinations, namely $\d\mat{\eta}_{1{y}}(\mat{\Omega}_j)$, $\d\mat{\eta}_2(\overline{\ten{\eta}})$ and $\d\mat{\eta}_2(\mat{\beta}_j)$, are zero. Continuing with the partial differentials of $l_n$ from \eqref{eq:log-likelihood},
\begin{multline*}
\d l_n(\overline{\ten{\eta}}) = \sum_{i = 1}^{n} (\langle \d\overline{\ten{\eta}}, \ten{X}_i \rangle - \D b(\mat{\eta}_{1{y_i}})\d\mat{\eta}_{1{y_i}}(\overline{\ten{\eta}})) = \sum_{i = 1}^{n} \t{(\vec{\ten{X}_i} - \vec\E[\ten{X} \mid Y = y_i])}\d\vec{\overline{\ten{\eta}}} \\
= \t{(\d\vec{\overline{\ten{\eta}}})}\vec\sum_{i = 1}^{n} (\ten{X}_i - \E[\ten{X} \mid Y = y_i]).
\end{multline*}
For every $j = 1, \ldots, r$ we get the differentials
\begin{align*}
\d l_n(\mat{\beta}_j) &= \sum_{i = 1}^{n} \biggl(\Bigl\langle \ten{F}_{y_i}\mlm_{\substack{k = 1\\k\neq j}}^{r}\mat{\beta}_k\times_j\d\mat{\beta}_j, \ten{X}_i \Bigr\rangle - \D b(\mat{\eta}_{1{y_i}})\d\mat{\eta}_{1{y_i}}(\mat{\beta}_j)\biggr) \\
&= \sum_{i = 1}^{n} \Bigl\langle \ten{F}_{y_i}\mlm_{\substack{k = 1\\k\neq j}}^{r}\mat{\beta}_k\times_j\d\mat{\beta}_j, \ten{X}_i - \E[\ten{X} \mid Y = y_i] \Bigr\rangle \\
&= \sum_{i = 1}^{n} \tr\biggl( \d\mat{\beta}_j\Bigl(\ten{F}_{y_i}\mlm_{\substack{k = 1\\k\neq j}}^{r}\mat{\beta}_k\Bigr)_{(j)} \t{(\ten{X}_i - \E[\ten{X} \mid Y = y_i])_{(j)}} \biggr) \\
&= \t{(\d\vec{\mat{\beta}_j})}\vec\sum_{i = 1}^{n} (\ten{X}_i - \E[\ten{X} \mid Y = y_i])_{(j)} \t{\Bigl(\ten{F}_{y_i}\mlm_{\substack{k = 1\\k\neq j}}^{r}\mat{\beta}_k\Bigr)_{(j)}}
\end{align*}
as well as
\begin{align*}
\d l_n(\mat{\Omega}_j) &= \sum_{i = 1}^{n} \biggl( c\Bigl\langle \ten{X}_i\mlm_{\substack{k = 1\\k\neq j}}^{r}\mat{\Omega}_k\times_j\d\mat{\Omega}_j, \ten{X}_i \Bigr\rangle - \D b(\mat{\eta}_2)\d\mat{\eta}_2(\mat{\Omega}_j) \biggr) \\
&= c\sum_{i = 1}^{n} \biggl( \Bigl\langle \ten{X}_i\mlm_{\substack{k = 1\\k\neq j}}^{r}\mat{\Omega}_k\times_j\d\mat{\Omega}_j, \ten{X}_i \Bigr\rangle \\
&\qquad\qquad - \t{(\pinv{\mat{D}_p}\vec\E[\ten{X}\circ\ten{X}\mid Y = y_i])}\t{\mat{D}_p}\vec\Bigl(\,\bigkron_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigkron_{l = j - 1}^{1}\mat{\Omega}_l \Bigr) \biggr) \\
&\overset{(a)}{=} c\sum_{i = 1}^{n} \biggl( \Bigl\langle \ten{X}_i\mlm_{\substack{k = 1\\k\neq j}}^{r}\mat{\Omega}_k\times_j\d\mat{\Omega}_j, \ten{X}_i \Bigr\rangle \\
&\qquad\qquad - \t{(\vec\E[\ten{X}\circ\ten{X}\mid Y = y_i])}\vec\Bigl(\,\bigkron_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigkron_{l = j - 1}^{1}\mat{\Omega}_l \Bigr) \biggr) \\
&= c\sum_{i = 1}^{n} \biggl( \t{\vec(\ten{X}_i\circ\ten{X}_i - \E[\ten{X}\circ\ten{X}\mid Y = y_i])}\vec\Bigl(\,\bigkron_{k = r}^{j + 1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigkron_{l = j - 1}^{1}\mat{\Omega}_l \Bigr) \biggr) \\
&= c\sum_{i = 1}^{n} \bigl(\ten{X}_i\otimes\ten{X}_i - \E[\ten{X}\otimes\ten{X}\mid Y = y_i]\bigr)\mlm_{\substack{k = 1\\k\neq j}}^{r}\t{(\vec{\mat{\Omega}_k})}\times_j\t{(\d\vec{\mat{\Omega}_j})} \\
&= c\t{(\d\vec{\mat{\Omega}_j})}\sum_{i = 1}^{n} \Bigl(\bigl(\ten{X}_i\otimes\ten{X}_i - \E[\ten{X}\otimes\ten{X}\mid Y = y_i]\bigr)\mlm_{\substack{k = 1\\k\neq j}}^{r}\t{(\vec{\mat{\Omega}_k})}\Bigr)_{(j)} \\
&= c\t{(\d\vec{\mat{\Omega}_j})}\vec\sum_{i = 1}^{n} \bigl(\ten{X}_i\otimes\ten{X}_i - \E[\ten{X}\otimes\ten{X}\mid Y = y_i]\bigr)\mlm_{\substack{k = 1\\k\neq j}}^{r}\t{(\vec{\mat{\Omega}_k})}.
\end{align*}
Equality $(a)$ holds because $\mat{N}_p = \mat{D}_p\pinv{\mat{D}_p}$ is the symmetrizer matrix \cite[][Ch.~11]{AbadirMagnus2005}. The symmetrizer matrix $\mat{N}_p$ satisfies $\mat{N}_p\vec{\mat{A}} = \vec{\mat{A}}$ if $\mat{A} = \t{\mat{A}}$, and
\begin{displaymath}
\vec\E[\ten{X}\circ\ten{X}\mid Y = y] = \vec\E[(\vec{\ten{X}})\t{(\vec{\ten{X}})}\mid Y = y]
\end{displaymath}
is the vectorization of a symmetric matrix. Finally, the identity $\d \ten{A}(\ten{B}) = \t{(\d\vec{\ten{B}})}\nabla_{\ten{B}}\ten{A}$ identifies the partial gradients.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Asymptotic Normality for M-estimators on Manifolds}
To prove asymptotic results, we need some technical lemmas as well as the definition of the tangent space of a manifold, provided herein. We conclude this subsection with \cref{thm:M-estimator-asym-normal-on-manifolds}, a more general form of \cref{thm:asymptotic-normality-gmlm}.
The following is a reformulation of Lemma~2.3 in \cite{BuraEtAl2018}, which assumes their Condition~2.2 to hold. Condition~2.2 requires the existence of a mapping from an open set into the unrestricted parameter space $\Xi$ whose image is the restricted parameter space $\Theta$. The existence of such a mapping is not needed; the weaker assumption $\Theta\subseteq\Xi$, which is trivially satisfied in our setting, is sufficient, with an almost identical proof.
\begin{lemma}[Existence of strong M-estimators on Subsets]\label{thm:exists-strong-M-estimator-on-subsets}
Assume there exists a (weak/strong) M-estimator $\hat{\mat{\xi}}_n$ for $M_n$ over $\Xi$. Then, there exists a strong M-estimator $\hat{\mat{\theta}}_n$ for $M_n$ over any non-empty $\Theta\subseteq\Xi$.
\end{lemma}
\begin{proof}%[\hypertarget{proof:exists-strong-M-estimator-on-subsets}{Proof of \cref{thm:exists-strong-M-estimator-on-subsets}}]
Let $\hat{\mat{\xi}}_n$ be a (weak/strong) M-estimator for the unconstrained problem. By definition, in either case,
\begin{displaymath}
\sup_{\mat{\xi}\in\Xi} M_n(\mat{\xi}) \leq M_n(\hat{\mat{\xi}}_n) + o_P(1).
\end{displaymath}
Since $\emptyset\neq\Theta\subseteq\Xi$, $\sup_{\mat{\theta}\in\Theta} M_n(\mat{\theta}) \leq \sup_{\mat{\xi}\in\Xi} M_n(\mat{\xi})$ and, with $M_n(\mat{\xi}) < \infty$ for any $\mat{\xi}\in\Xi$,
\begin{displaymath}
P\Bigl( \sup_{\mat{\theta}\in\Theta} M_n(\mat{\theta}) < \infty \Bigr) \geq P\Bigl( \sup_{\mat{\xi}\in\Xi} M_n(\mat{\xi}) < \infty \Bigr) \xrightarrow{n\to\infty} 1.
\end{displaymath}
If $\sup_{\mat{\theta}\in\Theta} M_n(\mat{\theta}) < \infty$, then, for any $\epsilon_n > 0$, there exists $\hat{\mat{\theta}}_n\in\Theta$ such that $\sup_{\mat{\theta}\in\Theta} M_n(\mat{\theta}) - \epsilon_n \leq M_n(\hat{\mat{\theta}}_n)$. Therefore, we can choose $\epsilon_n\in o(n^{-1})$, which yields
\begin{displaymath}
P\Bigl( M_n(\hat{\mat{\theta}}_n) \geq \sup_{\mat{\theta}\in\Theta} M_n(\mat{\theta}) - o(n^{-1}) \Bigr) \geq P\Bigl( \sup_{\mat{\theta}\in\Theta} M_n(\mat{\theta}) < \infty \Bigr) \xrightarrow{n\to\infty} 1.
\end{displaymath}
The last statement is equivalent to
\begin{displaymath}
M_n(\hat{\mat{\theta}}_n) \geq \sup_{\mat{\theta}\in\Theta} M_n(\mat{\theta}) - o_P(n^{-1}),
\end{displaymath}
which is the definition of $\hat{\mat{\theta}}_n$ being a strong M-estimator over $\Theta$.
\end{proof}
\begin{lemma}[Existence and Consistency of M-estimators on Subsets]\label{thm:M-estimator-consistency-on-subsets}
Let $\Xi$ be a convex open subset of a Euclidean space and $\Theta\subseteq\Xi$ non-empty. Assume $\mat{\xi}\mapsto m_{\mat{\xi}}(z)$ is a strictly concave function on $\Xi$ for almost all $z$ and $z\mapsto m_{\mat{\xi}}(z)$ is measurable for all $\mat{\xi}\in\Xi$. Let $M(\mat{\xi}) = \E m_{\mat{\xi}}(Z)$ be a well-defined function with a unique maximizer $\mat{\theta}_0\in\Theta\subseteq\Xi$; that is, $M(\mat{\theta}_0) > M(\mat{\xi})$ for all $\mat{\xi}\neq\mat{\theta}_0$. Also, assume
\begin{displaymath}
\E\sup_{\mat{\xi}\in K}|m_{\mat{\xi}}(Z)| < \infty
\end{displaymath}
for every non-empty compact $K\subset\Xi$. Then, there exists a strong M-estimator $\hat{\mat{\theta}}_n$ of $M_n(\mat{\theta}) = \frac{1}{n}\sum_{i = 1}^{n} m_{\mat{\theta}}(Z_i)$ over the subset $\Theta$. Moreover, any strong M-estimator $\hat{\mat{\theta}}_n$ of $M_n$ over $\Theta$ converges in probability to $\mat{\theta}_0$, that is, $\hat{\mat{\theta}}_n\xrightarrow{p}\mat{\theta}_0$.
\end{lemma}
\begin{proof}%[Proof of \cref{thm:M-estimator-consistency-on-subsets}]
The proof follows the proof of Proposition~2.4 in \cite{BuraEtAl2018} with the same assumptions, except that we only require $\Theta$ to be a subset of $\Xi$. This is accounted for by replacing Lemma~2.3 in \cite{BuraEtAl2018} with \cref{thm:exists-strong-M-estimator-on-subsets} to obtain the existence of a strong M-estimator on $\Theta$.
\end{proof}
We also need the concept of a \emph{tangent space} to formulate asymptotic normality in a way that is independent of a particular coordinate representation. Intuitively, the tangent space at a point $\mat{x}\in\manifold{A}$ of the manifold $\manifold{A}$ is the space of all velocity vectors $\t{\nabla\gamma(0)}$ of curves $\gamma:(-1, 1)\to\manifold{A}$ passing through $\mat{x} = \gamma(0)$, see \cref{fig:torus}. Locally, at $\mat{x} = \gamma(0)$ with a chart $\varphi$, we can write $\gamma(t) = \varphi^{-1}(\varphi(\gamma(t)))$, which gives $\Span\t{\nabla\gamma(0)} \subseteq \Span\t{\nabla\varphi^{-1}(\varphi(\mat{x}))}$. Taking the union over all smooth curves through $\mat{x}$ gives equality. The following definition leverages the simplified setup of smooth manifolds in Euclidean space.
\begin{figure}[!hpt]
\centering
\includegraphics[width = 0.5\textwidth]{images/TorustangentSpace.pdf}
\caption{\label{fig:torus}Visualization of the tangent space $T_{\mat{x}}\manifold{A}$ at $\mat{x}$ of the torus $\manifold{A}$. The torus $\manifold{A}$ is a 2-dimensional embedded manifold in $\mathbb{R}^3$.
The tangent space $T_{\mat{x}}\manifold{A}\subset\mathbb{R}^3$ is a 2-dimensional hyperplane visualized with its origin $\mat{0}$ shifted to $\mat{x}$. Two curves $\gamma_1, \gamma_2$ with $\mat{x} = \gamma_1(0) = \gamma_2(0)$ are drawn on the torus. The curve velocity vectors $\t{\nabla\gamma_1(0)}$ and $\t{\nabla\gamma_2(0)}$ are drawn as tangent vectors with root $\mat{x}$.}
\end{figure}
\begin{definition}[Tangent Space]\label{def:tangent-space}
Let $\manifold{A}\subseteq\mathbb{R}^p$ be an embedded smooth manifold and $\mat{x}\in\manifold{A}$. The \emph{tangent space} at $\mat{x}$ of $\manifold{A}$ is defined as
\begin{displaymath}
T_{\mat{x}}\manifold{A} := \Span\t{\nabla\varphi^{-1}(\varphi(\mat{x}))}
\end{displaymath}
for any chart $\varphi$ with $\mat{x}$ in the pre-image of $\varphi$.
\end{definition}
\Cref{def:tangent-space} is consistent, since it can be shown that two different charts at the same point yield the same span.
We are now ready to provide and prove \cref{thm:M-estimator-asym-normal-on-manifolds}, a more general version of \cref{thm:asymptotic-normality-gmlm}.
\begin{theorem}[\hyperlink{proof:M-estimator-asym-normal-on-manifolds}{Asymptotic Normality for M-estimators on Manifolds}]\label{thm:M-estimator-asym-normal-on-manifolds}
Let $\Theta\subseteq\mathbb{R}^p$ be a smooth embedded manifold. For each $\mat{\theta}$ in a neighborhood in $\mathbb{R}^p$ of the true parameter $\mat{\theta}_0\in\Theta$, let $z\mapsto m_{\mat{\theta}}(z)$ be measurable and $\mat{\theta}\mapsto m_{\mat{\theta}}(z)$ be differentiable at $\mat{\theta}_0$ for almost all $z$. Assume also that there exists a measurable function $u$ with $\E[u(Z)^2] < \infty$ such that, for almost all $z$ and all $\mat{\theta}_1, \mat{\theta}_2$ in a neighborhood of $\mat{\theta}_0$,
\begin{displaymath}
| m_{\mat{\theta}_1}\!(z) - m_{\mat{\theta}_2}\!(z) | \leq u(z) \| \mat{\theta}_1 - \mat{\theta}_2 \|_2.
\end{displaymath}
Moreover, assume that $\mat{\theta}\mapsto\E[m_{\mat{\theta}}(Z)]$ admits a second-order Taylor expansion at $\mat{\theta}_0$ in a neighborhood of $\mat{\theta}_0$ in $\mathbb{R}^p$ with a non-singular Hessian $\mat{H}_{\mat{\theta}_0} = \nabla^2_{\mat{\theta}}\E[m_{\mat{\theta}}(Z)]|_{\mat{\theta} = \mat{\theta}_0}\in\mathbb{R}^{p\times p}$.
If $\hat{\mat{\theta}}_n$ is a strong M-estimator of $\mat{\theta}_0$ in $\Theta$, then $\hat{\mat{\theta}}_n$ is asymptotically normal,
\begin{displaymath}
\sqrt{n}(\hat{\mat{\theta}}_n - \mat{\theta}_0) \xrightarrow{d} \mathcal{N}_p\left(\mat{0}, \mat{\Pi}_{\mat{\theta}_0} \E\left[\nabla_{\mat{\theta}} m_{\mat{\theta}_0}(Z)\t{(\nabla_{\mat{\theta}} m_{\mat{\theta}_0}(Z))}\right] \mat{\Pi}_{\mat{\theta}_0}\right)
\end{displaymath}
where $\mat{\Pi}_{\mat{\theta}_0} = \mat{P}_{\mat{\theta}_0}\pinv{(\t{\mat{P}_{\mat{\theta}_0}}\mat{H}_{\mat{\theta}_0}\mat{P}_{\mat{\theta}_0})}\t{\mat{P}_{\mat{\theta}_0}}$ and $\mat{P}_{\mat{\theta}_0}$ is any matrix whose span is the tangent space $T_{\mat{\theta}_0}\Theta$ of $\Theta$ at $\mat{\theta}_0$.
\end{theorem}
\begin{proof}[\hypertarget{proof:M-estimator-asym-normal-on-manifolds}{Proof of \cref{thm:M-estimator-asym-normal-on-manifolds}}]
Let $\varphi:U\to\varphi(U)$ be a coordinate chart\footnote{By \cref{def:manifold}, the chart $\varphi : U\to\varphi(U)$ is bi-continuous, infinitely often continuously differentiable, and has a continuously differentiable inverse $\varphi^{-1} : \varphi(U) \to U$. Furthermore, the domain $U$ is open with respect to the subspace topology on $\Theta$, which means that there exists an open set $O\subseteq\mathbb{R}^p$ such that $U = \Theta\cap O$.} with $\mat{\theta}_0\in U\subseteq\Theta$. As $\varphi$ is continuous, $\varphi(\hat{\mat{\theta}}_n)\xrightarrow{p}\varphi(\mat{\theta}_0)$ by the continuous mapping theorem on metric spaces \cite[][Theorem~18.11]{vanderVaart1998}, which implies $P(\varphi(\hat{\mat{\theta}}_n)\in\varphi(U))\xrightarrow{n\to\infty}1$.
The next step is to apply Theorem~5.23 in \cite{vanderVaart1998} to $\hat{\mat{s}}_n = \varphi(\hat{\mat{\theta}}_n)$. To this end, assume that $\hat{\mat{s}}_n\in\varphi(U)$. Denote with $\mat{s} = \varphi(\mat{\theta})\in\varphi(U)\subseteq\mathbb{R}^d$ the coordinates of the parameter $\mat{\theta}\in U\subseteq\Theta$ of the $d = \dim(\Theta)$ dimensional manifold $\Theta\subseteq\mathbb{R}^p$. Since $\varphi : U\to\varphi(U)$ is bijective, we can express $m_{\mat{\theta}}$ in terms of $\mat{s} = \varphi(\mat{\theta})$ as $m_{\mat{\theta}} = m_{\varphi^{-1}(\mat{s})}$ for every $\mat{\theta}\in U$. Furthermore, we let
\begin{displaymath}
M(\mat{\theta}) = \E[m_{\mat{\theta}}(Z)] \qquad\text{and}\qquad M_{\varphi}(\mat{s}) = \E[m_{\varphi^{-1}(\mat{s})}(Z)] = M(\varphi^{-1}(\mat{s})).
\end{displaymath}
\begin{figure}[H]
\centering
\includegraphics{images/embeddImage.pdf}
\caption{\label{fig:proof:M-estimator-asym-normal-on-manifolds}Depiction of the notation used in the proof of \cref{thm:M-estimator-asym-normal-on-manifolds}. Example with $p = 3$ and $d = \dim(\Theta) = 2$.}
\end{figure}
By assumption, the function $M(\mat{\theta})$ is twice continuously differentiable in a neighborhood\footnote{A set $N$ is called a neighborhood of $u$ if there exists an open set $O$ such that $u\in O\subseteq N$.} of $\mat{\theta}_0$. Without loss of generality, we can assume that $U$ is contained in that neighborhood. Then, using the chain rule, we compute the gradient of $M_{\varphi}$ at $\mat{s}_0$ to be $\mat{0}$,
\begin{displaymath}
\nabla M_{\varphi}(\mat{s}_0) = {\nabla\varphi^{-1}(\mat{s}_0)}{\nabla M(\varphi^{-1}(\mat{s}_0))} = {\nabla\varphi^{-1}(\mat{s}_0)}{\nabla M(\mat{\theta}_0)} = {\nabla\varphi^{-1}(\mat{s}_0)}\mat{0} = \mat{0}
\end{displaymath}
because $\mat{\theta}_0 = \varphi^{-1}(\mat{s}_0)$ is a maximizer of $M$. The second derivative of $M_{\varphi}$, evaluated at $\mat{s}_0 = \varphi(\mat{\theta}_0)$ and using $\nabla M_{\varphi}(\mat{s}_0) = \mat{0}$, is
\begin{displaymath}
\nabla^2 M_{\varphi}(\mat{s}_0) = \nabla\varphi^{-1}(\mat{s}_0)\nabla^2 M(\varphi^{-1}(\mat{s}_0))\t{\nabla\varphi^{-1}(\mat{s}_0)} = \nabla\varphi^{-1}(\mat{s}_0)\mat{H}_{\mat{\theta}_0}\t{\nabla\varphi^{-1}(\mat{s}_0)}.
\end{displaymath}
This gives the second-order Taylor expansion of $M_{\varphi}$ at $\mat{s}_0$ as
\begin{displaymath}
M_{\varphi}(\mat{s}) = M_{\varphi}(\mat{s}_0) + \frac{1}{2}\t{(\mat{s} - \mat{s}_0)} \nabla^2 M_{\varphi}(\mat{s}_0) (\mat{s} - \mat{s}_0) + \mathcal{O}(\|\mat{s} - \mat{s}_0\|^3).
\end{displaymath}
We also need to check the local Lipschitz condition of $m_{\varphi^{-1}(\mat{s})}$. Let $V_{\epsilon}(\mat{s}_0)$ be the open $\epsilon$-ball with center $\mat{s}_0$; i.e., $V_{\epsilon}(\mat{s}_0) = \{ \mat{s}\in\mathbb{R}^d : \|\mat{s} - \mat{s}_0\| < \epsilon \}$. Since $\varphi(U)$ contains $\mat{s}_0$, and is open in $\mathbb{R}^d$, there exists an $\epsilon > 0$ such that $V_{\epsilon}(\mat{s}_0)\subseteq\varphi(U)$.
The closed $\epsilon/2$ ball $\overline{V}_{\epsilon / 2}(\mat{s}_0)$ is a neighborhood of $\mat{s}_0$ and $\sup_{\mat{s}\in \overline{V}_{\epsilon / 2}(\mat{s}_0)}\|\nabla\varphi^{-1}(\mat{s})\| < \infty$ due to the continuity of $\nabla\varphi^{-1}$ on $\varphi(U)$ with $\overline{V}_{\epsilon / 2}(\mat{s}_0)\subset V_{\epsilon}(\mat{s}_0)\subseteq\varphi(U)$. Then, for almost every $z$ and every $\mat{s}_1 = \varphi(\mat{\theta}_1), \mat{s}_2 = \varphi(\mat{\theta}_2)\in\overline{V}_{\epsilon / 2}(\mat{s}_0)$, \begin{multline*} | m_{\varphi^{-1}(\mat{s}_1)}(z) - m_{\varphi^{-1}(\mat{s}_2)}(z) | = | m_{\mat{\theta}_1}(z) - m_{\mat{\theta}_2}(z) | \overset{(a)}{\leq} u(z) \| \mat{\theta}_1 - \mat{\theta}_2 \| \\ = u(z) \| \varphi^{-1}(\mat{s}_1) - \varphi^{-1}(\mat{s}_2) \| \overset{(b)}{\leq} u(z) \sup_{\mat{s}\in \overline{V}_{\epsilon / 2}(\mat{s}_0)}\|\nabla\varphi^{-1}(\mat{s})\| \|\mat{s}_1 - \mat{s}_2\| =: v(z) \|\mat{s}_1 - \mat{s}_2\|. \end{multline*} Here, $(a)$ holds by assumption and $(b)$ is a result of the mean value theorem. Now, $v(z)$ is measurable and square integrable as a scaled version of $u(z)$. Finally, since $\varphi$ is one-to-one, $\hat{\mat{s}}_n = \varphi(\hat{\mat{\theta}}_n)$ is a strong M-estimator for $\mat{s}_0 = \varphi(\mat{\theta}_0)$ of the objective $M_{\varphi}$. We next apply Theorem~5.23 in \cite{vanderVaart1998} to obtain the asymptotic normality of $\hat{\mat{s}}_n$, \begin{displaymath} \sqrt{n}(\hat{\mat{s}}_n - \mat{s}_0) \xrightarrow{d} \mathcal{N}_{d}(0, \mat{\Sigma}_{\mat{s}_0}), \end{displaymath} where the $d\times d$ variance-covariance matrix $\mat{\Sigma}_{\mat{s}_0}$ is given by \begin{align*} \mat{\Sigma}_{\mat{s}_0} &= (\nabla^2 M_{\varphi}(\mat{s}_0))^{-1}\E[\nabla_{\mat{s}} m_{\varphi^{-1}(\mat{s}_0)}(Z)\t{(\nabla_{\mat{s}} m_{\varphi^{-1}(\mat{s}_0)}(Z))}](\nabla^2 M_{\varphi}(\mat{s}_0))^{-1}. \end{align*} { \def\PP{\mat{\varPhi}_{\mat{\theta}_0}} \def\EE#1#2{\E[\nabla_{#2} m_{#1}(Z)\t{(\nabla_{#2} m_{#1}(Z))}]} Applying the delta method, we obtain \begin{displaymath} \sqrt{n}(\hat{\mat{\theta}}_n - \mat{\theta}_0)= \sqrt{n}(\varphi^{-1}(\hat{\mat{s}}_n) - \varphi^{-1}(\mat{s}_0)) \xrightarrow{d} \mathcal{N}_p(0, \t{\nabla\varphi^{-1}(\mat{s}_0)}\mat{\Sigma}_{\mat{s}_0}{\nabla\varphi^{-1}(\mat{s}_0)}). \end{displaymath} We continue by reexpressing the $p\times p$ asymptotic variance-covariance matrix of $\hat{\mat{\theta}}_n$ in terms of $\mat{\theta}_0$ instead of $\mat{s}_0 = \varphi(\mat{\theta}_0)$. We let $\PP = \t{\nabla\varphi^{-1}(\varphi(\mat{\theta}_0))} = \t{\nabla\varphi^{-1}(\mat{s}_0)}$ and observe that for all $\mat{s}\in\varphi(U)$, the gradient of $\mat{s}\mapsto m_{\varphi^{-1}(\mat{s})}(z)$ evaluated at $\mat{s}_0 = \varphi(\mat{\theta}_0)$ has the form \begin{displaymath} \nabla_{\mat{s}}m_{\varphi^{-1}(\mat{s}_0)}(z)= \nabla\varphi^{-1}(\mat{s}_0)\nabla_{\mat{\theta}}m_{\mat{\theta}_0}(z)= \t{\PP}\nabla_{\mat{\theta}}m_{\mat{\theta}_0}(z). 
\end{displaymath} Then, \begin{multline*} \t{\nabla\varphi^{-1}(\mat{s}_0)}\mat{\Sigma}_{\mat{s}_0}{\nabla\varphi^{-1}(\mat{s}_0)}= \PP\mat{\Sigma}_{\mat{s}_0}\t{\PP} \\ = \PP(\nabla^2 M_{\varphi}(\mat{s}_0))^{-1}\EE{\varphi^{-1}(\mat{s}_0)}{}(\nabla^2 M_{\varphi}(\mat{s}_0))^{-1}\t{\PP} \\ = {\PP}(\t{\PP}\mat{H}_{\mat{\theta}_0}\PP)^{-1}\t{\PP}\EE{\mat{\theta}_0}{\mat{\theta}}{\PP}(\t{\PP}\mat{H}_{\mat{\theta}_0}\PP)^{-1}\t{\PP} \\ = \mat{\Pi}_{\mat{\theta}_0}\EE{\mat{\theta}_0}{\mat{\theta}}\mat{\Pi}_{\mat{\theta}_0} \end{multline*} where the last equality holds because $\Span\PP = T_{\mat{\theta}_0}\Theta$, by \cref{def:tangent-space} of the tangent space $T_{\mat{\theta}_0}\Theta$. It remains to show that $\mat{\Pi}_{\mat{\theta}_0} = \mat{P}_{\mat{\theta}_0}\pinv{(\t{\mat{P}_{\mat{\theta}_0}}\mat{H}_{\mat{\theta}_0}\mat{P}_{\mat{\theta}_0})}\t{\mat{P}_{\mat{\theta}_0}}$ for any $p\times k$ matrix $\mat{P}_{\mat{\theta}_0}$ such that $k\geq d$ and $\Span{\mat{P}_{\mat{\theta}_0}} = T_{\mat{\theta}_0}\Theta$. This also ensures that the final result is independent of the chosen chart $\varphi$, since the tangent space does not depend on a specific chart. Therefore, let $\PP = {\mat{Q}}{\mat{R}}$ and $\mat{P}_{\mat{\theta}_0} = \widetilde{\mat{Q}}\widetilde{\mat{R}}$ be their thin QR decompositions, respectively. Both $\mat{Q}, \widetilde{\mat{Q}}$ have dimension $p\times d$, $\mat{Q}$ is semi-orthogonal, and $\mat{R}$ is invertible of dimension $d\times d$ while $\widetilde{\mat{R}}$ is a $d\times k$ full row-rank matrix. Since $\mat{Q}$ is semi-orthogonal, the $p\times p$ matrix $\mat{Q}\t{\mat{Q}}$ is an orthogonal projection onto $\Span\mat{Q} = \Span\mat{P}_{\mat{\theta}_0} = T_{\mat{\theta}_0}\Theta$. This allows to express $\mat{P}_{\mat{\theta}_0}$ in terms of $\mat{Q}$ as \begin{displaymath} \mat{P}_{\mat{\theta}_0} = \mat{Q}\t{\mat{Q}}\mat{P}_{\mat{\theta}_0} = \mat{Q}\t{\mat{Q}}\widetilde{\mat{Q}}\widetilde{\mat{R}} =: {\mat{Q}}\mat{M}. \end{displaymath} From $\Span\mat{Q} = \Span\mat{P}_{\mat{\theta}_0}$, it follows that the $d\times k$ matrix $\mat{M}$ is also of full row-rank. Also, $\mat{M}\pinv{\mat{M}} = \mat{I}_d = \mat{R}\mat{R}^{-1}$ as a property of the Moore-Penrose pseudo inverse with $\mat{M}$ being of full row-rank. Another property of the pseudo inverse is that for matrices $\mat{A}, \mat{B}$, where $\mat{A}$ has full column-rank and $\mat{B}$ has full row-rank, $\pinv{(\mat{A}\mat{B})} = \pinv{\mat{B}}\pinv{\mat{A}}$. This enables the computation \begin{multline*} \mat{P}_{\mat{\theta}_0}\pinv{(\t{\mat{P}_{\mat{\theta}_0}}\mat{H}_{\mat{\theta}_0}\mat{P}_{\mat{\theta}_0})}\t{\mat{P}_{\mat{\theta}_0}}= \mat{Q} \mat{M} \pinv{\mat{M}} (\t{\mat{Q}} \mat{H}_{\mat{\theta}_0} \mat{Q})^{-1} \t{(\mat{M} \pinv{\mat{M}})} \t{\mat{Q}} \\ = \mat{Q} {\mat{R}} {\mat{R}}^{-1} (\t{\mat{Q}} \mat{H}_{\mat{\theta}_0} \mat{Q})^{-1} \t{({\mat{R}} {\mat{R}}^{-1})} \t{\mat{Q}} = \PP(\t{\PP}\mat{H}_{\mat{\theta}_0}\PP)^{-1}\t{\PP} = \mat{\Pi}_{\mat{\theta}_0}. \end{multline*} } \end{proof} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Asymptotic Normality of GMLM} We rewrite the log-likelihood \eqref{eq:log-likelihood} in a different form to simplify the proof of \cref{thm:asymptotic-normality-gmlm} and to provide the notation to express the regularity conditions of \cref{thm:asymptotic-normality-gmlm} in a compact form. 
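For the reader's convenience, we recall two standard identities used repeatedly in the rewriting below: $\vec(\mat{A}\mat{X}\mat{C}) = (\t{\mat{C}}\otimes\mat{A})\vec{\mat{X}}$ for conformable matrices, and $\vec{\mat{\Omega}} = \mat{D}_p\vech{\mat{\Omega}}$ for symmetric $\mat{\Omega}$, where $\mat{D}_p$ denotes the duplication matrix \cite[][Ch.~11]{AbadirMagnus2005}. As a small illustration of the latter (with an arbitrary symmetric matrix whose entries $a, b, c$ carry no further meaning), for $p = 2$ and $\mat{\Omega} = \begin{pmatrix} a & b \\ b & c \end{pmatrix}$,
\begin{displaymath}
\vech{\mat{\Omega}} = \begin{pmatrix} a \\ b \\ c \end{pmatrix}, \qquad
\mat{D}_2 = \begin{pmatrix}
    1 & 0 & 0 \\
    0 & 1 & 0 \\
    0 & 1 & 0 \\
    0 & 0 & 1
\end{pmatrix}, \qquad
\mat{D}_2\vech{\mat{\Omega}} = \begin{pmatrix} a \\ b \\ b \\ c \end{pmatrix} = \vec{\mat{\Omega}}.
\end{displaymath}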
The first natural parameter component $\mat{\eta}_{1y}$ defined in \eqref{eq:eta1-manifold} can be written as
\begin{align*}
\mat{\eta}_{1y}
&= \vec{\overline{\ten{\eta}}} + \mat{B}\vec{\ten{F}_y}
= \mat{I}_p\vec{\overline{\ten{\eta}}} + (\t{(\vec{\ten{F}_y})}\otimes\mat{I}_p)\vec{\mat{B}} \\
&= \begin{pmatrix}
    \mat{I}_p & \t{(\vec{\ten{F}_y})}\otimes\mat{I}_p
\end{pmatrix}\begin{pmatrix}
    \vec{\overline{\ten{\eta}}} \\ \vec{\mat{B}}
\end{pmatrix}.
\end{align*}
The second natural parameter component $\mat{\eta}_2$, modeled in \eqref{eq:eta2-manifold}, relates to $\vech{\mat{\Omega}}$ linearly as
\begin{displaymath}
\mat{\eta}_2 = c\t{\mat{D}_p}\vec{\mat{\Omega}} = c\t{\mat{D}_p}\mat{D}_p\vech{\mat{\Omega}}.
\end{displaymath}
This gives the following relation between $\mat{\eta}_y = (\mat{\eta}_{1y}, \mat{\eta}_2)$ and $\mat{\xi} = (\vec{\overline{\ten{\eta}}}, \vec{\mat{B}}, \vech{\mat{\Omega}})\in\Xi$,
\begin{equation}
\mat{\eta}_y = \begin{pmatrix}
    \mat{I}_p & \ \ \ \t{(\vec{\ten{F}_y})}\otimes\mat{I}_p\ \ & 0 \\
    0 & 0 & c\t{\mat{D}_p}\mat{D}_p
\end{pmatrix} \begin{pmatrix}
    \vec{\overline{\ten{\eta}}} \\ \vec{\mat{B}} \\ \vech{\mat{\Omega}}
\end{pmatrix} =: \mat{F}(y)\mat{\xi}
\label{eq:eta-to-xi-linear-relation}
\end{equation}
where $\mat{F}(y)$ is a $p (p + 3) / 2\times p (p + 2 q + 3) / 2$ dimensional matrix-valued function in $y$. Moreover, for every $y$ the matrix $\mat{F}(y)$ is of full rank.
The log-likelihood of model \eqref{eq:quad-density} for the unconstrained parameters $\mat{\xi}\in\Xi$ is
\begin{displaymath}
l_n(\mat{\xi}) = \frac{1}{n}\sum_{i = 1}^{n} (\langle \mat{t}(\ten{X}_i), \mat{\eta}_{y_i} \rangle - b(\mat{\eta}_{y_i})) =: \frac{1}{n}\sum_{i = 1}^{n} m_{\mat{\xi}}(Z_i)
\end{displaymath}
where $Z_i = (\ten{X}_i, Y_i)$. Using \eqref{eq:eta-to-xi-linear-relation}, we can write, for $z = (\ten{X}, y)$,
\begin{displaymath}
m_{\mat{\xi}}(z) = \langle\mat{t}(\ten{X}), \mat{F}(y)\mat{\xi}\rangle - b(\mat{F}(y)\mat{\xi}).
\end{displaymath}
The following are the regularity conditions for the log-likelihood required by \cref{thm:asymptotic-normality-gmlm}.
\begin{condition}\label{cond:differentiable-and-convex}
The mapping $\mat{\xi}\mapsto m_{\mat{\xi}}(z)$ is twice continuously differentiable for almost every $z$ and $z\mapsto m_{\mat{\xi}}(z)$ is measurable. Moreover, $\mat{\eta}\mapsto b(\mat{\eta})$ is strictly convex. Furthermore, for every $\widetilde{\mat{\eta}}$, $P(\mat{F}(Y)\mat{\xi} = \widetilde{\mat{\eta}}) < 1$.
\end{condition}
\begin{condition}\label{cond:moments}
$\E\|\t{\mat{t}(\ten{X})}\mat{F}(Y)\| < \infty$, and $\E\|\t{\mat{t}(\ten{X})}\mat{F}(Y)\|^2 < \infty$.
\end{condition}
\begin{condition}\label{cond:finite-sup-on-compacta}
The mapping $\mat{\eta}\mapsto b(\mat{\eta})$ is twice continuously differentiable and for every non-empty compact $K\subseteq\Xi$,
\begin{gather*}
\E\sup_{\mat{\xi}\in K}\|b(\mat{F}(Y)\mat{\xi})\| < \infty, \qquad
\E\sup_{\mat{\xi}\in K}\|\t{\nabla b(\mat{F}(Y)\mat{\xi})}\mat{F}(Y)\|^2 < \infty, \\
\E\sup_{\mat{\xi}\in K}\| \t{\mat{F}(Y)}\nabla^2 b(\mat{F}(Y)\mat{\xi})\mat{F}(Y) \| < \infty.
\end{gather*}
\end{condition}
We continue with some more technical lemmas needed for the proof of \cref{thm:asymptotic-normality-gmlm}.
\begin{lemma}\label{thm:kron-perm}
Given $r \geq 2$ matrices $\mat{A}_k$ of dimension $p_k\times q_k$ for $k = 1, \ldots, r$, there exists a unique permutation matrix $\mat{S}_{\mat{p}, \mat{q}}$ such that
\begin{equation}\label{eq:kron-to-outer-perm}
\vec\bigkron_{k = r}^{1}\mat{A}_k = \mat{S}_{\mat{p}, \mat{q}}\vec\bigouter_{k = 1}^{r}\mat{A}_k.
\end{equation}
The permutation $\mat{S}_{\mat{p}, \mat{q}}$ with indices $\mat{p} = (p_1, \ldots, p_r)$ and $\mat{q} = (q_1, \ldots, q_r)$ is defined recursively as
\begin{equation}\label{eq:S_pq}
\mat{S}_{\mat{p}, \mat{q}} = \mat{S}_{\bigl( \prod_{k = 1}^{r - 1}p_k, p_r \bigr), \bigl( \prod_{k = 1}^{r - 1}q_k, q_r \bigr)} \bigl(\mat{I}_{p_r q_r}\otimes\mat{S}_{(p_1, \ldots, p_{r-1}), (q_1, \ldots, q_{r-1})}\bigr)
\end{equation}
with initial value
\begin{displaymath}
\mat{S}_{(p_1, p_2), (q_1, q_2)} = \mat{I}_{q_2}\otimes\mat{K}_{q_1, p_2}\otimes\mat{I}_{p_1}
\end{displaymath}
where $\mat{K}_{p, q}$ is the \emph{commutation matrix} \cite[][Ch.~11]{AbadirMagnus2005}; that is, the permutation such that $\vec{\t{\mat{A}}} = \mat{K}_{p, q}\vec{\mat{A}}$ for every $p\times q$ dimensional matrix $\mat{A}$.
\end{lemma}
\begin{proof}
Lemma~7 in \cite{MagnusNeudecker1986} states that
\begin{align}
\vec(\mat{A}_2\otimes\mat{A}_1)
&= (\mat{I}_{q_2}\otimes\mat{K}_{q_1, p_2}\otimes\mat{I}_{p_1})(\vec{\mat{A}_2}\otimes\vec{\mat{A}_1}) \label{eq:MagnusNeudecker1986-vec-kron-identity} \\
&= (\mat{I}_{q_2}\otimes\mat{K}_{q_1, p_2}\otimes\mat{I}_{p_1})\vec(\mat{A}_1\circ \mat{A}_2). \nonumber
\end{align}
This proves the statement for $r = 2$. The general statement for $r > 2$ follows by induction. Assuming \eqref{eq:kron-to-outer-perm} holds for $r - 1$, the induction step is
\begin{multline*}
\vec{\bigkron_{k = r}^{1}}\mat{A}_k
= \vec\Bigl(\mat{A}_r\otimes\bigkron_{k = r - 1}^{1}\mat{A}_k\Bigr) \\
\overset{\eqref{eq:MagnusNeudecker1986-vec-kron-identity}}{=} \Bigl( \mat{I}_{q_r}\otimes\mat{K}_{\prod_{k = 1}^{r - 1}q_k, p_r}\otimes\mat{I}_{\prod_{k = 1}^{r - 1}p_k} \Bigr)\vec\Bigl((\vec\mat{A}_r)\otimes\vec\bigkron_{k = r - 1}^{1}\mat{A}_k\Bigr) \\
= \mat{S}_{\bigl( \prod_{k = 1}^{r - 1}p_k, p_r \bigr), \bigl( \prod_{k = 1}^{r - 1}q_k, q_r \bigr)}\vec\Bigl[\Bigl(\vec\bigkron_{k = r - 1}^{1}\mat{A}_k\Bigr)\t{(\vec\mat{A}_r)}\Bigr] \\
\overset{\eqref{eq:kron-to-outer-perm}}{=} \mat{S}_{\bigl( \prod_{k = 1}^{r - 1}p_k, p_r \bigr), \bigl( \prod_{k = 1}^{r - 1}q_k, q_r \bigr)}\vec\Bigl[\mat{S}_{(p_1, \ldots, p_{r-1}), (q_1, \ldots, q_{r-1})}\Bigl(\vec\bigouter_{k = 1}^{r - 1}\mat{A}_k\Bigr)\t{(\vec\mat{A}_r)}\Bigr] \\
\overset{(a)}{=} \mat{S}_{\bigl( \prod_{k = 1}^{r - 1}p_k, p_r \bigr), \bigl( \prod_{k = 1}^{r - 1}q_k, q_r \bigr)} \bigl(\mat{I}_{p_r q_r}\otimes\mat{S}_{(p_1, \ldots, p_{r-1}), (q_1, \ldots, q_{r-1})}\bigr)\vec\Bigl[\Bigl(\vec\bigouter_{k = 1}^{r - 1}\mat{A}_k\Bigr)\t{(\vec\mat{A}_r)}\Bigr] \\
= \mat{S}_{\mat{p}, \mat{q}}\vec\bigouter_{k = 1}^{r}\mat{A}_k.
\end{multline*}
Equality $(a)$ uses the relation $\vec(\mat{C}\mat{a}\t{\mat{b}}) = (\mat{I}_{\dim(\mat{b})}\otimes\mat{C})\vec(\mat{a}\t{\mat{b}})$ for a matrix $\mat{C}$ and vectors $\mat{a}, \mat{b}$.
\end{proof}
\begin{lemma}\label{thm:kron-manifold-tangent-space}
Let $\manifold{A}_k\subseteq\mathbb{R}^{p_k\times q_k}\backslash\{\mat{0}\}$ for $k = 1, \ldots, r$ be smooth embedded submanifolds, each of which is either a sphere or a cone. Then
\begin{displaymath}
\manifold{K} = \Bigl\{ \bigkron_{k = r}^{1}\mat{A}_k : \mat{A}_k\in\manifold{A}_k \Bigr\}
\end{displaymath}
is an embedded manifold in $\mathbb{R}^{p\times q}$ for $p = \prod_{k = 1}^{r} p_k$ and $q = \prod_{k = 1}^{r} q_k$.
Furthermore, define for $j = 1, \ldots, r$ the matrices
\begin{equation}\label{eq:kron-differential-span}
\mat{\Gamma}_j = \bigkron_{k = r}^{1}(\mat{I}_{p_k q_k}\mathrm{\ if\ } j = k \mathrm{\ else\ }\vec{\mat{A}_k}) = \bigkron_{k = r}^{j + 1}(\vec{\mat{A}_k})\otimes\mat{I}_{p_j q_j}\otimes\bigkron_{k = j - 1}^{1}(\vec{\mat{A}_k})
\end{equation}
and let $\mat{\gamma}_j$ be $p_j q_j\times d_j$ matrices with $d_j \geq\dim\manifold{A}_j$ which span the tangent space $T_{\mat{A}_j}\manifold{A}_j$ of $\manifold{A}_j$ at $\mat{A}_j\in\manifold{A}_j$, that is, $\Span\mat{\gamma}_j = T_{\mat{A}_j}\manifold{A}_j$. Then, with the permutation matrix $\mat{S}_{\mat{p}, \mat{q}}$ defined in \eqref{eq:S_pq}, the $p q \times \sum_{j = 1}^{r} d_j$ dimensional matrix
\begin{displaymath}
\mat{P}_{\mat{A}} = \mat{S}_{\mat{p}, \mat{q}}\left[\mat{\Gamma}_1\mat{\gamma}_1, \mat{\Gamma}_2\mat{\gamma}_2, \ldots, \mat{\Gamma}_r\mat{\gamma}_r\right]
\end{displaymath}
spans the tangent space $T_{\mat{A}}\manifold{K}$ of $\manifold{K}$ at $\mat{A} = \bigkron_{k = r}^{1}\mat{A}_k\in\manifold{K}$; that is, $\Span\mat{P}_{\mat{A}} = T_{\mat{A}}\manifold{K}$.
\end{lemma}
\begin{proof}
The statement that $\manifold{K}$ is an embedded manifold follows via induction using \cref{thm:kron-manifolds}. We compute the differential of the vectorized Kronecker product using \cref{thm:kron-perm}, where $\mat{S}_{\mat{p}, \mat{q}}$ is the permutation \eqref{eq:S_pq} defined therein:
\begin{multline*}
\d\vec\bigotimes_{k = r}^{1}\mat{A}_k = \vec\sum_{j = 1}^{r}\bigkron_{k = r}^{1}(\ternary{k = j}{\d\mat{A}_j}{\mat{A}_k}) \\
= \mat{S}_{\mat{p},\mat{q}}\vec\sum_{j = 1}^{r}\Bigl(\bigouter_{k = 1}^{r}(\ternary{k = j}{\d\mat{A}_j}{\mat{A}_k})\Bigr) \\
= \mat{S}_{\mat{p}, \mat{q}}\sum_{j = 1}^{r}\bigkron_{k = r}^{1}(\ternary{k = j}{\vec\d\mat{A}_j}{\vec\mat{A}_k}) \\
= \mat{S}_{\mat{p}, \mat{q}}\sum_{j = 1}^{r}\Bigl(\bigkron_{k = r}^{1}(\ternary{k = j}{\mat{I}_{p_j q_j}}{\vec\mat{A}_k})\Bigr)\vec\d\mat{A}_j \\
= \mat{S}_{\mat{p}, \mat{q}}\sum_{j = 1}^{r}\mat{\Gamma}_j\vec\d\mat{A}_j = \mat{S}_{\mat{p}, \mat{q}}[\mat{\Gamma}_1, \ldots, \mat{\Gamma}_r]\begin{pmatrix}
    \vec\d\mat{A}_1 \\ \vdots \\ \vec\d\mat{A}_r
\end{pmatrix}.
\end{multline*}
Due to the definition of the manifold, this differential provides the gradient of a surjective map onto the manifold, and the span of this gradient is the tangent space. Now, we take a closer look at the differentials $\vec{\d\mat{A}_j}$ for $j = 1, \ldots, r$. Let $\varphi_j$ be a chart of $\manifold{A}_j$ in a neighborhood of $\mat{A}_j$. Then, $\mat{A}_j = \varphi_j^{-1}(\varphi_j(\mat{A}_j))$, which gives
\begin{displaymath}
\vec{\d\mat{A}_j} = \t{\nabla\varphi_j^{-1}(\varphi_j(\mat{A}_j))}\vec\d\varphi_j(\mat{A}_j).
\end{displaymath}
Therefore, by \cref{def:tangent-space} of the tangent space, $\Span{\t{\nabla\varphi_j^{-1}(\varphi_j(\mat{A}_j))}} = \Span{\mat{\gamma}_j}$ for every matrix $\mat{\gamma}_j$ with $\Span{\mat{\gamma}_j} = T_{\mat{A}_j}\manifold{A}_j$. We get
\begin{displaymath}
\Span\mat{S}_{\mat{p}, \mat{q}}[\mat{\Gamma}_1, \ldots, \mat{\Gamma}_r]\begin{pmatrix}
    \vec\d\mat{A}_1 \\ \vdots \\ \vec\d\mat{A}_r
\end{pmatrix} = \Span\mat{S}_{\mat{p}, \mat{q}}[\mat{\Gamma}_1\mat{\gamma}_1, \ldots, \mat{\Gamma}_r\mat{\gamma}_r] = \Span\mat{P}_{\mat{A}}
\end{displaymath}
which concludes the proof.
\end{proof}
\begin{proof}[\hypertarget{proof:asymptotic-normality-gmlm}{Proof of \cref{thm:asymptotic-normality-gmlm}}]
The proof consists of three parts.
First, we show the existence of a consistent strong M-estimator by applying \cref{thm:M-estimator-consistency-on-subsets}. Next, we apply \cref{thm:M-estimator-asym-normal-on-manifolds} to obtain its asymptotic normality. We conclude by computing the missing parts of the asymptotic covariance matrix $\mat{\Sigma}_{\mat{\theta}_0}$ provided by \cref{thm:M-estimator-asym-normal-on-manifolds}.

We first check that the conditions of \cref{thm:M-estimator-consistency-on-subsets} are satisfied. On $\Xi$, the mapping $\mat{\xi}\mapsto m_{\mat{\xi}}(z) = m_{\mat{\xi}}(\ten{X},y) = \langle \mat{F}(y)\mat{\xi}, \mat{t}(\ten{X}) \rangle - b(\mat{F}(y)\mat{\xi})$ is strictly concave for every $z$ because $\mat{\xi}\mapsto\mat{F}(y)\mat{\xi}$ is linear and $b$ is strictly convex by \cref{cond:differentiable-and-convex}. Since $\ten{X} \mid Y$ is distributed according to \eqref{eq:quad-density}, the function $M(\mat{\xi}) = \E m_{\mat{\xi}}(Z)$ is well defined by \cref{cond:moments}. Let $\mat{\xi}_k = (\vec{\overline{\ten{\eta}}_k}, \vec{\mat{B}_k}, \vech{\mat{\Omega}_k})$, and $f_{\mat{\xi}_k}$ be the pdf of $\ten{X} \mid Y$ indexed by $\mat{\xi}_k$, for $k = 1, 2$. If $\mat{\xi}_1\ne \mat{\xi}_2$, then $f_{\mat{\xi}_1} \neq f_{\mat{\xi}_2}$, which implies, by Lemma~5.35 in \cite{vanderVaart1998}, that the true $\mat{\theta}_0\in\Theta\subseteq\Xi$ is the unique maximizer of $M$. Finally, under \cref{cond:finite-sup-on-compacta}, all assumptions of \cref{thm:M-estimator-consistency-on-subsets} are fulfilled, yielding the existence of a consistent strong M-estimator over $\Theta\subseteq\Xi$.

Next, let $\hat{\mat{\theta}}_n$ be a strong M-estimator on $\Theta\subseteq\Xi$, whose existence and consistency were shown in the previous step. Since $z\mapsto m_{\mat{\xi}}(z)$ is measurable for all $\mat{\xi}\in\Xi$, it is also measurable in a neighborhood of $\mat{\theta}_0$. The differentiability of $\mat{\theta}\mapsto m_{\mat{\theta}}(z)$ is stated in \cref{cond:differentiable-and-convex}. For the Lipschitz condition, let $K\subseteq\Xi$ be a compact neighborhood of $\mat{\theta}_0$, which exists since $\Xi$ is open. Then,
\begin{multline*}
    \left| m_{\mat{\theta}_1}(z) - m_{\mat{\theta}_2}(z) \right| = \left| \langle \mat{t}(\ten{X}), \mat{F}(y)(\mat{\theta}_1 - \mat{\theta}_2) \rangle - b(\mat{F}(y)\mat{\theta}_1) + b(\mat{F}(y)\mat{\theta}_2) \right| \\
    \leq (\| \t{\mat{F}(y)}\mat{t}(\ten{X}) \|_2 + \sup_{\mat{\theta}\in K}\| \nabla b(\mat{F}(y)\mat{\theta}) \mat{F}(y)\| ) \| \mat{\theta}_1 - \mat{\theta}_2 \|_2 =: u(z)\| \mat{\theta}_1 - \mat{\theta}_2 \|_2
\end{multline*}
where $u(z)$ is measurable and square integrable by \cref{cond:finite-sup-on-compacta}. A second-order Taylor expansion of $\mat{\theta}\mapsto M(\mat{\theta}) = \E m_{\mat{\theta}}(Z)$ exists in a neighborhood of $\mat{\theta}_0$ by \cref{cond:finite-sup-on-compacta}. Moreover, the Hessian $\mat{H}_{\mat{\theta}_0}$ is non-singular by the strict convexity of $b$ stated in \cref{cond:differentiable-and-convex}.
Now, we can apply \cref{thm:M-estimator-asym-normal-on-manifolds} to obtain the asymptotic normality of $\sqrt{n}(\hat{\mat{\theta}}_n - \mat{\theta}_0)$ with variance-covariance structure
\begin{equation}\label{eq:asymptotic-covariance-gmlm}
    \mat{\Sigma}_{\mat{\theta}_0} = \mat{\Pi}_{\mat{\theta}_0} \E[\nabla m_{\mat{\theta}_0}(Z)\t{(\nabla m_{\mat{\theta}_0}(Z))}]\mat{\Pi}_{\mat{\theta}_0}
\end{equation}
where $\mat{\Pi}_{\mat{\theta}_0} = \mat{P}_{\mat{\theta}_0}(\t{\mat{P}_{\mat{\theta}_0}}\mat{H}_{\mat{\theta}_0}\mat{P}_{\mat{\theta}_0})^{-1}\t{\mat{P}_{\mat{\theta}_0}}$ and $\mat{P}_{\mat{\theta}_0}$ is any $p\times \dim(\Theta)$ matrix that spans the tangent space of $\Theta$ at $\mat{\theta}_0$, that is, $\Span \mat{P}_{\mat{\theta}_0} = T_{\mat{\theta}_0}\Theta$.

Finally, we compute a matrix $\mat{P}_{\mat{\theta}_0}$ such that $\Span{\mat{P}_{\mat{\theta}_0}} = T_{\mat{\theta}_0}\Theta$ for $\Theta = \mathbb{R}^p\times\manifold{K}_{\mat{B}}\times\vech(\manifold{K}_{\mat{\Omega}})$ as in \cref{thm:param-manifold}. Since $\Theta$ is a product manifold, we get a block diagonal structure for $\mat{P}_{\mat{\theta}_0}$ as
\begin{displaymath}
    \mat{P}_{\mat{\theta}_0} = \begin{pmatrix} \mat{I}_p & 0 & 0 \\ 0 & \mat{P}_{\mat{B}_0} & 0 \\ 0 & 0 & \mat{P}_{\mat{\Omega}_0} \end{pmatrix}
\end{displaymath}
where $\mat{I}_p$ is the identity matrix spanning the tangent space of $\mathbb{R}^p$, which is identified with $\mathbb{R}^p$ itself. The blocks $\mat{P}_{\mat{B}_0}$ and $\mat{P}_{\mat{\Omega}_0}$ need to span the tangent spaces of $\manifold{K}_{\mat{B}}$ and $\vech(\manifold{K}_{\mat{\Omega}})$, respectively. Both $\manifold{K}_{\mat{B}}$ and $\vech(\manifold{K}_{\mat{\Omega}})$ are manifolds according to \cref{thm:param-manifold}. The manifold $\vech(\manifold{K}_{\mat{\Omega}})$ is the image of $\manifold{K}_{\mat{\Omega}}$ under the linear map $\vech(\mat{\Omega}) = \pinv{\mat{D}_p}\vec{\mat{\Omega}}$, which leads to the differential $\d\vech{\mat{\Omega}} = \pinv{\mat{D}_p}\vec{\d\mat{\Omega}}$. Applying \cref{thm:kron-manifold-tangent-space} to $\manifold{K}_{\mat{B}}$ and $\manifold{K}_{\mat{\Omega}}$ gives
\begin{align*}
    \mat{P}_{\mat{B}_0} &= \mat{S}_{\mat{p}, \mat{q}}[\mat{\Gamma}_{\mat{\beta}_1}\mat{\gamma}_{\mat{\beta}_1}, \ldots, \mat{\Gamma}_{\mat{\beta}_r}\mat{\gamma}_{\mat{\beta}_r}], \\
    \mat{P}_{\mat{\Omega}_0} &= \pinv{\mat{D}_p}\mat{S}_{\mat{p}, \mat{p}}[\mat{\Gamma}_{\mat{\Omega}_1}\mat{\gamma}_{\mat{\Omega}_1}, \ldots, \mat{\Gamma}_{\mat{\Omega}_r}\mat{\gamma}_{\mat{\Omega}_r}]
\end{align*}
where the matrices $\mat{S}_{\mat{p}, \mat{q}}$, $\mat{\Gamma}_{\mat{\beta}_j}$, $\mat{\gamma}_{\mat{\beta}_j}$, $\mat{\Gamma}_{\mat{\Omega}_j}$ and $\mat{\gamma}_{\mat{\Omega}_j}$ are described in \cref{thm:kron-manifold-tangent-space} for the Kronecker manifolds $\manifold{K}_{\mat{B}}$ and $\manifold{K}_{\mat{\Omega}}$. This leads to
\begin{equation}\label{eq:param-manifold-span}
    \mat{P}_{\mat{\theta}_0} = \begin{pmatrix}
        \mat{I}_p & 0 & 0 \\
        0 & \mat{S}_{\mat{p}, \mat{q}}[\mat{\Gamma}_{\mat{\beta}_1}\mat{\gamma}_{\mat{\beta}_1}, \ldots, \mat{\Gamma}_{\mat{\beta}_r}\mat{\gamma}_{\mat{\beta}_r}] & 0 \\
        0 & 0 & \pinv{\mat{D}_p}\mat{S}_{\mat{p}, \mat{p}}[\mat{\Gamma}_{\mat{\Omega}_1}\mat{\gamma}_{\mat{\Omega}_1}, \ldots, \mat{\Gamma}_{\mat{\Omega}_r}\mat{\gamma}_{\mat{\Omega}_r}]
    \end{pmatrix}.
\end{equation}
\end{proof}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Demonstration of the Advantages of Tensor Representations}\label{sec:GMLMvsVecEfficiency}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Minor revision reviewer comment
% My concerns have been largely resolved. One minor point remains:
%
% I appreciate the authors' clarification that the Kronecker-structured
% parameterization improves efficiency by exploiting the tensor structure.
% While the rationale is sound, readers would benefit from a brief numerical
% illustration--perhaps in the supplementary material--demonstrating empirical
% variance reduction or faster convergence relative to an unconstrained fit.
% Including a small, illustrative example would make the claimed efficiency gain
% more concrete and convincing.

We illustrate the competitive advantage of the tensor structure compared with the vector representation using a simple example. We generate standard normal responses $y_i$, for $i = 1, \ldots, n$. For this simulation, we set the GMLM parameters $\overline{\ten{\eta}} = \mat{0}$, $\mat{\beta}_1 = \mat{\beta}_2 = \t{(-1, 1, -1, 1, \ldots)}\in\mathbb{R}^{\tilde{p}}$ to vectors of alternating signs, and $\mat{\Sigma}_1 = \mat{\Sigma}_2 = \mat{I}_{\tilde{p}}$ so that $\mat{\alpha}_j=\mat{\Sigma}_j^{-1}\mat{\beta}_j=\mat{\beta}_j$, $j=1,2$, and $\mat{\Omega}_1 = \mat{\Omega}_2 = \mat{I}_{\tilde{p}}$. Given $y_i$, we draw matrix-valued $\ten{X}_i$'s of dimension $p_1\times p_2$, where $p_1 = p_2 = \tilde{p}$, from
\begin{displaymath}
    \ten{X}_i = \bar{\mat{\eta}} + \mat{\alpha}_1\ten{F}_{y_i}\t{\mat{\alpha}_2} + \mat{\epsilon}_i
\end{displaymath}
where $\ten{F}_{y_i}\equiv y_i$ is a $1\times 1$ matrix and $\vec{\mat{\epsilon}_i}\sim\mathcal{N}(\mat{0}, \mat{\Sigma}_2\otimes\mat{\Sigma}_1)$. That is, $\ten{X}\mid Y = y_i$ is matrix normal with parameters as specified above.

We compare four different estimators for the vectorized reduction matrix $\mat{B} = \mat{\beta}_2\otimes\mat{\beta}_1\in\mathbb{R}^{\tilde{p}^2}$. First, a linear model, which, in this scenario, is equivalent to the vectorized GMLM model if the sample size exceeds the dimensionality $\tilde{p}^2$ of the vectorized predictors $\vec(\ten{X})$. Second, our GMLM model applied to the vectorized predictors, which are distributed as multivariate normal (tensor order $r = 1$) conditionally on $y$. Third, our GMLM model as designed for matrix-valued predictors (tensor order $r = 2$). Fourth, another GMLM model for matrix-valued predictors, but now with a manifold constraint on the precision matrices $\mat{\Omega}_1, \mat{\Omega}_2$, requiring them to be scaled identity matrices. This means we let $\mat{\Omega}_1, \mat{\Omega}_2\in\{ a\mat{I}_{\tilde{p}} : 0 < a \}$, which is a one-dimensional matrix manifold from \cref{tab:matrix-manifolds}. We did not fix $a$ to be $1$ so as to operate in a context where there is not enough information to uniquely resolve the scale. In the first two cases the reduction matrix estimate, $\hat{\mat{B}}$, is computed directly. In contrast, the matrix-valued GMLM models provide the mode-wise estimates $\hat{\mat{\beta}}_1, \hat{\mat{\beta}}_2$, which are then used to compute $\hat{\mat{B}} = \hat{\mat{\beta}}_2\otimes\hat{\mat{\beta}}_1$.
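A minimal \texttt{R} sketch of this data-generating process is given below for illustration; the dimension and sample size are arbitrary example values, and all variable names are ours.
\begin{verbatim}
## Illustrative sketch of the simulation's data-generating process.
p.tilde <- 4; n <- 200                       # example dimension and sample size
beta1 <- beta2 <- (-1)^(1:p.tilde)           # alternating sign vectors
y <- rnorm(n)                                # standard normal responses
X <- array(dim = c(p.tilde, p.tilde, n))
for (i in 1:n) {
  eps <- matrix(rnorm(p.tilde^2), p.tilde)   # vec(eps) ~ N(0, I)
  X[, , i] <- tcrossprod(beta1, beta2) * y[i] + eps
}
B <- beta2 %x% beta1                         # true vectorized reduction
\end{verbatim}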
For accuracy assessment, we use the subspace distance $d(\mat{B}, \hat{\mat{B}})$ from \cref{sec:simulations} to compare the estimates $\hat{\mat{B}}$ with the true reduction $\mat{B} = \mat{\beta}_2\otimes\mat{\beta}_1$ used for data generation. The simulation was carried out for different sample sizes $n = 100, 200, 300, 500, 750$ and increasing matrix row and column dimension $\tilde{p} = 2, 3, 4, 5, 6, 7, 9, 11, 13, 15, 18, 22, 27, 32$ of the predictors $\ten{X}$. The results are visualized in \cref{fig:GMLMvsVecEfficiency}, where we plot the mean $\pm$ one standard deviation of the subspace distance over 100 replicates.

Both matrix-valued GMLM estimators perform uniformly better than the vector-based alternatives, irrespective of $\tilde{p}$, which ranges from $2$ to $32$ in this simulation. Between these two, the constrained model, which is the closest to the true variance-covariance structure, practically reproduces the true model. Moreover, both exhibit roughly the same accuracy and, in particular, no deterioration despite the increase in dimension $\tilde{p}$. The linear model performs as expected: it drastically deteriorates as the dimension increases. The yellow lines stop where the dimension $\tilde{p}^2$ exceeds the sample size $n$. Similarly, the GMLM applied to the vectorized matrix-valued $\ten{X}$ has identical performance to the plain linear model up to the point where the regularization kicks in, as described in \cref{sec:tensor-normal-estimation}. For example, for $n=750$, the performance of the two models is identical up to $\tilde{p}=18$, after which it improves until it reaches a floor. This floor nevertheless always remains above the corresponding error of either tensor version of the GMLM model, with or without the constraint. The accuracy is monotonically increasing in the sample size for all four models; that is, lines closer to the $x$-axis correspond to larger sample sizes across $\tilde{p}$. The only exception is the GMLM applied to the vectorized predictors (red lines) in the transition phase where the regularization is used to deal with $\tilde{p}^2 > n$.
\begin{figure}
    \centering
    \caption{\label{fig:GMLMvsVecEfficiency}Accuracy Gain: Tensor-valued versus Vectorized Predictors}
    \includegraphics[width = \textwidth]{plots/sim_efficiency.pdf}
\end{figure}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Investigation and Comparison of GMLM versus TSIR}\label{sec:GMLMvsTSIR}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

TSIR suffers a substantial loss in predictive accuracy on the full 2D EEG data (see \cref{tab:eeg}) as compared to the lower-dimensional setting of pre-processed $\ten{X}$. Compelling evidence suggests that this is due to TSIR not directly estimating the mode-wise reduction matrices $\mat{\beta}_j$: it estimates the mode-wise covariances $\mat{\Sigma}_j$ and mode-wise reduction matrices $\mat{\Gamma}_j$ for standardized observations and sets $\widehat{\mat{\beta}}_j = \widehat{\mat{\Sigma}}_j^{-1}\widehat{\mat{\Gamma}}_j$. The estimation accuracy of the $\widehat{\mat{\beta}}_j$s depends on both $\widehat{\mat{\Sigma}}_j$ and $\widehat{\mat{\Gamma}}_j$. Specifically, as the dimensions increase, the mode-wise covariance estimates become ill-conditioned, leading to massive errors in the reduction matrix estimates $\widehat{\mat{\beta}}_j$.
\begin{figure}
    \centering
    \caption{\label{fig:TSIRvsGMLM}TSIR in red vs. GMLM in black}
    \includegraphics[width = \textwidth]{plots/tsir_eeg_2d_bad_perf.pdf}
\end{figure}

To observe this in practice, we took the full 2D EEG data (only the S1 stimulus) and computed $\widehat{\mat{\Gamma}}^{\mathrm{TSIR}}_1, \widehat{\mat{\Gamma}}^{\mathrm{TSIR}}_2$, $\widehat{\mat{\Sigma}}^{\mathrm{TSIR}}_1, \widehat{\mat{\Sigma}}^{\mathrm{TSIR}}_2$, from which we computed the reductions $\widehat{\mat{\beta}}^{\mathrm{TSIR}}_1$ and $\widehat{\mat{\beta}}^{\mathrm{TSIR}}_2$. For comparison, we used GMLM to estimate the reduction matrices $\widehat{\mat{\beta}}^{\mathrm{GMLM}}_1$, $\widehat{\mat{\beta}}^{\mathrm{GMLM}}_2$ and the scatter matrices $\widehat{\mat{\Omega}}^{\mathrm{GMLM}}_1$, $\widehat{\mat{\Omega}}^{\mathrm{GMLM}}_2$. We also computed the GMLM analog of the standardized reductions $\widehat{\mat{\Gamma}}^{\mathrm{GMLM}}_1 = (\widehat{\mat{\Omega}}^{\mathrm{GMLM}}_1)^{-1}\widehat{\mat{\beta}}_1^{\mathrm{GMLM}}$ and $\widehat{\mat{\Gamma}}^{\mathrm{GMLM}}_2 = (\widehat{\mat{\Omega}}^{\mathrm{GMLM}}_2)^{-1}\widehat{\mat{\beta}}_2^{\mathrm{GMLM}}$. We next plotted in \cref{fig:TSIRvsGMLM} the estimated reductions $\widehat{\mat{\beta}}^{\mathrm{GMLM}}_1$, $\widehat{\mat{\beta}}^{\mathrm{GMLM}}_2$, $\widehat{\mat{\Gamma}}^{\mathrm{GMLM}}_1$, $\widehat{\mat{\Gamma}}^{\mathrm{GMLM}}_2$ in black and $\widehat{\mat{\beta}}^{\mathrm{TSIR}}_1$, $\widehat{\mat{\beta}}^{\mathrm{TSIR}}_2$, $\widehat{\mat{\Gamma}}^{\mathrm{TSIR}}_1$, $\widehat{\mat{\Gamma}}^{\mathrm{TSIR}}_2$ in red (with appropriate scaling). Since the estimates of the reductions $\widehat{\mat{\beta}}_j$ are completely different between GMLM and TSIR, but the estimates of the standardized reductions $\mat{\Gamma}_j$ agree to a high degree, we conclude that the loss of estimation accuracy happens in the final calculations, which involve the inversion of the severely ill-conditioned mode-wise estimated covariances $\widehat{\mat{\Sigma}}^{\mathrm{TSIR}}_1$ and $\widehat{\mat{\Sigma}}^{\mathrm{TSIR}}_2$. To check whether our conjecture holds, we added to the mode-wise sample covariances $\mat{\Sigma}_j$ in TSIR a regularization term similar to the one employed in GMLM. As can be seen in \cref{fig:TSIRregvsGMLM}, this produced roughly the same reductions $\widehat{\mat{\beta}}_j$, as well as the same $\widehat{\mat{\Gamma}}_j$, as GMLM.
\begin{figure}
    \caption{TSIR (reg) in red vs. GMLM in black}
    \label{fig:TSIRregvsGMLM}
    \includegraphics[width = \textwidth]{plots/tsir_regularized.pdf}
\end{figure}

To further study the relationship between GMLM and TSIR, we first consider a setting in which GMLM and TSIR are theoretically equivalent. We assume that $Y$ is categorical, $\ten{X}\mid Y = y$ is multi-linear normal, and the reduction $\ten{R}(\ten{X})$ is one-dimensional. Additionally, $\mat{\Omega}_j$ is assumed to be positive definite but \emph{not} further constrained. Under these assumptions, GMLM and TSIR are equivalent since the theoretical mode-wise reduction matrices maximizing the log-likelihood (GMLM's objective) and minimizing the squared error loss (TSIR's objective) are identical. In practice, though, even in this simplest of settings, the two procedures do not fully agree in their estimates, for several reasons.
\begin{enumerate}
\item The two procedures use different parameterizations.
The reduction matrices $\mat{\beta}_j$ are parameters in GMLM, in contrast to TSIR. As a result, TSIR first estimates its parameters $\mat{\Gamma}_j$ and mode-wise covariances $\mat{\Sigma}_j$ (corresponding to the marginal mode-wise covariances $\cov(\ten{X}_{(j)})$) and then estimates $\mat{\beta}_j$ with $\widehat{\mat{\beta}}_j =\widehat{\mat{\Sigma}}_j^{-1}\widehat{\mat{\Gamma}}_j$. This ``two-step'' approach leads to an accumulation of estimation errors. Moreover, if any $\widehat{\mat{\Sigma}}_j$ is ill-conditioned, the resulting $\widehat{\mat{\beta}}_j$ is very unstable. In contrast, GMLM incorporates the reductions $\mat{\beta}_j$ directly into the model and estimates both $\mat{\beta}_j$ and $\mat{\Omega}_j$ at the same time, so that the estimation error in $\mat{\Omega}_j$ does \emph{not} affect the reductions. This is because GMLM's mode-wise scatter matrices $\mat{\Omega}_j$ are estimated from the inverse regression residuals (corresponding to $(\E\cov(\ten{X}_{(j)}\mid Y))^{-1}$).
\item The original version of TSIR \cite[]{DingCook2015} does \emph{not} incorporate any regularization. We introduced regularization to TSIR, leading to the TSIR (reg) procedure, by simply adding $0.2\lambda_1(\mat{\Sigma}_j)\mat{I}_{p_j}$ to $\mat{\Sigma}_j$ if and only if the condition number of $\mat{\Sigma}_j$ exceeds an arbitrary threshold of $25$, similar to the regularization used in GMLM. Without this regularization, TSIR can fail dramatically, which is the reason for the drop in AUC in \cref{tab:eeg} for TSIR in contrast to TSIR (reg). In comparison, GMLM behaves more stably with regard to estimating the reductions $\mat{\beta}_j$, even without regularization. At the same time, the need for regularization is more pronounced in GMLM: even in well-behaved settings, the estimation algorithm for GMLM relies on its regularization to guard against divergence of the iterative estimation procedure. In summary, without regularization, TSIR may break down completely, while GMLM merely has worse average performance (not directly reported, since the regularization in GMLM is the default for the multi-linear normal algorithm and is applied only in a subset of iterations).
\end{enumerate}
If the response $Y$ is a univariate continuous variable, then GMLM and TSIR are no longer equivalent from a theoretical point of view.
\begin{enumerate}\setcounter{enumi}{2}
\item TSIR performs slicing of the continuous response $y$, which misses fine-grained information. GMLM does not have this restriction, as it works directly with the continuous response. This is the main reason for the slight performance gain of GMLM over TSIR in the simulations of \cref{sec:sim-tensor-normal} (due to the small dimensionality of the simulations, regularization for TSIR is not needed, while GMLM applies regularization during its iterative estimation procedure for algorithmic stability only; the final GMLM estimates are thus almost never regularized).
\item Another major difference between GMLM and TSIR is the tensor-valued function $\ten{F}_y$ of the response, which makes GMLM much more general than TSIR. If $\ten{F}_y$ is the fraction of observations in a slice, TSIR is in effect a special case of GMLM.
This difference manifests in two ways. For general $\ten{F}_y\in\mathbb{R}^{q_1\times\cdots\times q_r}$, its dimension $q = \prod_{j = 1}^r q_j$ inherently constrains the dimension of the reduction in \cref{thm:sdr}. TSIR also requires specifying a per-mode reduction dimension $\tilde{q}_j$, which leads to a final reduction of dimension $\tilde{q} = \prod_{j = 1}^r \tilde{q}_j$, but without a link to an explicitly provided function. This makes the specification of the reduction dimension simpler, although the attainable dimensions remain constrained, from a number-theoretic divisibility point of view, to products of the per-mode dimensions.
\end{enumerate}

\noindent So far, we required no additional constraints on the model parameters.
\begin{enumerate}\setcounter{enumi}{4}
\item Given additional information about the regression at hand, one might want to incorporate that knowledge into the method. For example, for longitudinal data, the time dimension may be modeled as an AR process. GMLM allows (under the conditions outlined in \cref{sec:manifolds}) such modeling by describing the corresponding parameter manifolds of the covariances or the reductions themselves, a functionality \emph{not} available in TSIR.
\end{enumerate}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Parameter Estimation for the Multi-Linear Ising Model}\label{sec:ising-param-estimation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Initial Values}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The first step is to obtain reasonable starting values. Experiments showed that a good starting value for $\mat{\beta}_k$, $k = 1, \ldots, r$, is the multi-linear normal estimate from \cref{sec:tensor-normal-estimation}, treating $\ten{X}_i$ as continuous. For initial values of $\mat{\Omega}_k$, a different approach is required. Setting everything to the uninformative initial value results in $\mat{\Omega}_k = \mat{0}$, as this corresponds to conditional odds of $1:1$ for every component and pairwise interaction. This is not a viable choice, since $\mat{0}$ is a stationary point of the log-likelihood, as can be seen directly from the partial gradients of the log-likelihood in \cref{thm:grad}. Instead, we use a crude heuristic that treats every mode separately and ignores any relation to the covariates. It is computationally cheap and better than any of the alternatives we considered. For every $k = 1, \ldots, r$, let the $k$th mode second moment estimate be
\begin{equation}\label{eq:ising-mode-moments}
    \hat{\mat{M}}_{2(k)} = \frac{p_k}{n p}\sum_{i = 1}^n (\ten{X}_i)_{(k)}\t{(\ten{X}_i)_{(k)}}
\end{equation}
which contains the $k$th mode first moment estimate in its diagonal, $\hat{\mat{M}}_{1(k)} = \diag\hat{\mat{M}}_{2(k)}$. For the purpose of an initial value estimate, we treat each column of the matricized observation $(\ten{X}_i)_{(k)}$ as an i.i.d. observation of a $p_k$-dimensional random variable $Z_k$. With $n$ i.i.d. observations $\ten{X}_i$, this yields a total of $n \prod_{j\neq k}p_j$ realizations of the random variable $Z_k$ for each mode $k = 1, \ldots, r$.
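A minimal \texttt{R} sketch of this mode-wise moment estimate is given below for illustration; it assumes the observations are stored in an array \texttt{X} whose first $r$ dimensions are $p_1, \ldots, p_r$ and whose last dimension indexes the $n$ observations, and the function name is ours.
\begin{verbatim}
## Mode-wise second moment estimate (illustrative sketch); X is assumed to
## be an array of dimension (p_1, ..., p_r, n), k is the mode of interest.
mode_moment2 <- function(X, k) {
  dims <- dim(X)
  r <- length(dims) - 1L
  n <- dims[r + 1L]
  p <- prod(dims[1:r])
  Xk <- aperm(X, c(k, (1:(r + 1L))[-k]))       # bring mode k to the front
  dim(Xk) <- c(dims[k], length(Xk) / dims[k])  # columns are all mode-k fibers
  (dims[k] / (n * p)) * tcrossprod(Xk)         # p_k / (n p) * sum X_(k) X_(k)'
}
## the first moment estimate is the diagonal: diag(mode_moment2(X, k))
\end{verbatim}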
Continuing this reasoning, the element $(\hat{\mat{M}}_{1(k)})_{j}$ is an estimate of the marginal probability $P((Z_k)_j = 1)$ of the $j$th element of $Z_k$ being $1$. Similarly, for $l \neq j$, the entry $(\hat{\mat{M}}_{2(k)})_{j l}$ estimates the marginal probability of the two-way interaction, $P((Z_k)_j = 1, (Z_k)_l = 1)$. Now, we set the diagonal elements of $\mat{\Omega}_k$ to zero. For the off-diagonal elements of $\mat{\Omega}_k$, we equate the conditional probabilities $P((Z_k)_j = 1 \mid (Z_k)_{-j} = \mat{0})$ and $P((Z_k)_j = 1, (Z_k)_l = 1\mid (Z_k)_{-j, -l} = \mat{0})$ with the marginal probability estimates $(\hat{\mat{M}}_{1(k)})_{j}$ and $(\hat{\mat{M}}_{2(k)})_{j l}$, respectively. Applying \eqref{eq:ising-two-way-log-odds} gives the initial component-wise estimates $\hat{\mat{\Omega}}_k^{(0)}$,
\begin{equation}\label{eq:ising-init-Omegas}
    (\hat{\mat{\Omega}}_k^{(0)})_{j j} = 0, \qquad
    (\hat{\mat{\Omega}}_k^{(0)})_{j l} = \log\frac{1 - (\hat{\mat{M}}_{1(k)})_{j}(\hat{\mat{M}}_{1(k)})_{l}}{(\hat{\mat{M}}_{1(k)})_{j}(\hat{\mat{M}}_{1(k)})_{l}}\frac{(\hat{\mat{M}}_{2(k)})_{j l}}{1 - (\hat{\mat{M}}_{2(k)})_{j l}}, \, j \neq l.
\end{equation}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Gradient Optimization}
Given initial values, the gradients derived in \cref{thm:grad} can be evaluated for the Ising model. The first step is therefore to determine the values of the inverse link components $\ten{g}_1(\mat{\gamma}_y) = \E[\ten{X} \mid Y = y]$ and $\ten{G}_2(\mat{\gamma}_y) = \ten{g}_2(\mat{\gamma}_y) = \E[\ten{X}\circ\ten{X} \mid Y = y]$. An immediate simplification is that the first moment is part of the second moment. Its values are determined via $\vec(\E[\ten{X} \mid Y = y]) = \diag(\E[\ten{X}\circ\ten{X} \mid Y = y]_{(1, \ldots, r)})$; i.e., only the second moment needs to be computed, or estimated (see \cref{sec:ising-bigger-dim}) in the case of slightly bigger $p$. For the Ising model, the conditional second moment with parameters $\mat{\gamma}_y$ is given by the matricized relation
\begin{equation}\label{eq:ising-m2}
    \ten{g}_2(\mat{\gamma}_y)_{(1, \ldots, r)} = \E\left[(\vec{\ten{X}})\t{(\vec{\ten{X}})}\mid Y = y\right] = p_0(\mat{\gamma}_y)\sum_{\mat{x}\in\{0, 1\}^{p}}\mat{x}\t{\mat{x}}\exp(\t{\vech(\mat{x}\t{\mat{x}})}\mat{\gamma}_y).
\end{equation}
The natural parameter $\mat{\gamma}_y$ is evaluated via \eqref{eq:ising-natural-params}, enabling us to compute, by \cref{thm:grad}, the partial gradients of the log-likelihood $l_n$ \eqref{eq:log-likelihood} for the Ising model with respect to the GMLM parameters $\mat{\beta}_k$ and $\mat{\Omega}_k$, $k = 1, \ldots, r$, at the current iterate $\mat{\theta}^{(I)} = (\mat{\beta}_1^{(I)}, \ldots, \mat{\beta}_r^{(I)}, \mat{\Omega}_1^{(I)}, \ldots, \mat{\Omega}_r^{(I)})$. Using classic gradient ascent for maximizing the log-likelihood, we have to specify a learning rate $\lambda\in\mathbb{R}_{+}$, usually a value close to $10^{-3}$. The update rule is
\begin{displaymath}
    \mat{\theta}^{(I + 1)} = \mat{\theta}^{(I)} + \lambda\nabla_{\mat{\theta}} l_n(\mat{\theta})\bigr|_{\mat{\theta} = \mat{\theta}^{(I)}},
\end{displaymath}
which is iterated until convergence. In practice, the iteration is run until either a maximum number of iterations is exhausted or some break condition is satisfied. A proper choice of the learning rate is needed, as a large learning rate $\lambda$ may cause instability, while a very low learning rate requires an enormous number of iterations.
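The following minimal \texttt{R} sketch illustrates this fixed learning rate update; \texttt{grad\_loglik} is a hypothetical placeholder for the evaluation of the partial gradients from \cref{thm:grad} on the vectorized parameter $\mat{\theta}$, the learning rate default matches the value mentioned above, and the remaining settings are arbitrary.
\begin{verbatim}
## Plain gradient ascent with a fixed learning rate (illustrative sketch);
## grad_loglik(theta) is a placeholder returning the gradient of l_n at theta.
gradient_ascent <- function(theta0, grad_loglik, lambda = 1e-3,
                            max_iter = 10000, tol = 1e-6) {
  theta <- theta0
  for (iter in seq_len(max_iter)) {
    g <- grad_loglik(theta)
    theta <- theta + lambda * g          # ascent step
    if (sqrt(sum(g^2)) < tol) break      # simple break condition
  }
  theta
}
\end{verbatim}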
Generally, there are two approaches to avoid the need to determine a proper learning rate. First, \emph{line search methods} determine an appropriate step size for every iteration. This works well if the evaluation of the objective function (the log-likelihood) is cheap, which is not the case in our setting; see \cref{sec:ising-bigger-dim}. The second approach is an \emph{adaptive learning rate}, where one tracks summary statistics during optimization and dynamically adapts the learning rate via well-tested heuristics based on the information gathered in past iterations. We opted for an adaptive learning rate approach, which not only removes the need to determine an appropriate learning rate but also accelerates learning.

Our method of choice is \emph{root mean squared propagation} (RMSprop) \cite[]{Hinton2012}, a well-known method in machine learning for training neural networks. It is a variation of gradient descent with an adaptive learning rate for each scalar parameter. It tracks a moving average of the element-wise squared gradient $\mat{g}_2^{(I)}$, which is then used to scale (element-wise) the gradient in the update rule (see \cite{Hinton2012} and \cite{GoodfellowEtAl2016} among others). The update rule using RMSprop for maximization\footnote{Instead of the more common minimization; hence the $+$ in the update of $\mat{\theta}$.} is
\begin{align*}
    \mat{g}_2^{(I + 1)} &= \nu \mat{g}_2^{(I)} + (1 - \nu)\nabla l_n(\mat{\theta}^{(I)})\odot\nabla l_n(\mat{\theta}^{(I)}), \\
    \mat{\theta}^{(I + 1)} &= \mat{\theta}^{(I)} + \frac{\lambda}{\sqrt{\mat{g}_2^{(I + 1)}} + \epsilon}\odot\nabla l_n(\mat{\theta}^{(I)}).
\end{align*}
The parameters $\nu = 0.9$, $\lambda = 10^{-3}$ and $\epsilon\approx 1.49\cdot 10^{-8}$ are fixed, and the initial value is $\mat{g}_2^{(0)} = \mat{0}$. The symbol $\odot$ denotes the Hadamard product, or element-wise multiplication; the division and square root operations are performed element-wise as well. According to our experiments, RMSprop requires between $50$ and $1000$ iterations until convergence, while gradient ascent with a learning rate of $10^{-3}$ requires between $1000$ and $10000$ iterations.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Small Data Sets}\label{sec:ising-small-data-sets}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
In the case of a finite number of observations, specifically in data sets with a small number of observations $n$, the situation can occur where one component is always either zero or one. It is also possible to observe two mutually exclusive components. In practice, this situation of a ``degenerate'' data set should be protected against. Since we work with parameters on a log scale, such configurations give estimates of $\pm\infty$, which lie outside the parameter space and break our optimization algorithm. The first situation where this needs to be addressed is in \eqref{eq:ising-init-Omegas}, where we set initial estimates for $\mat{\Omega}_k$. To avoid division by zero as well as evaluating the log of zero, we adapt \eqref{eq:ising-mode-moments}, the mode-wise moment estimates $\hat{\mat{M}}_{2(k)}$. A simple method is to replace the ``degenerate'' components, that is, entries with value $0$ or $1$, with the smallest positive estimate corresponding to exactly one occurrence, $p_k / (n p)$, or to all but one occurrence, $1 - p_k / (n p)$, respectively. The same problem is present in gradient optimization. Therefore, before starting the optimization, we detect degenerate combinations.
We compute upper and lower bounds for the ``degenerate'' elements in the Kronecker product $\hat{\mat{\Omega}} = \bigkron_{k = r}^{1}\hat{\mat{\Omega}}_k$. After every gradient update, we check whether any of the ``degenerate'' elements falls outside of its bounds. In that case, we adjust the elements of the Kronecker component estimates $\hat{\mat{\Omega}}_k$ corresponding to the ``degenerate'' element of their Kronecker product so that it falls inside the precomputed bounds. While doing so, we alter every component as little as possible, so that the non-degenerate elements of $\hat{\mat{\Omega}}$, which are also affected by this change due to the Kronecker structure, change only minimally. The exact details are technically cumbersome and provide little insight.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Slightly Bigger Dimensions}\label{sec:ising-bigger-dim}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
A big challenge for the Ising model is its high computational complexity, as it involves summing over all binary vectors of length $p = \prod_{k = 1}^{r}p_k$ in the partition function \eqref{eq:ising-partition-function}. Exact computation of the partition function requires summing over all $2^p$ binary vectors. For small dimensions, say $p\approx 10$, this is easily computed. Beyond a dimension of $20$ it becomes extremely expensive, and it is computationally infeasible for dimensions larger than $30$. Trying to avoid the evaluation of the log-likelihood and only computing its partial gradients via \cref{thm:grad} does not resolve the issue. The gradients require the inverse link, that is, the second moment \eqref{eq:ising-m2}, which still involves summing $2^p$ terms even if the scaling factor $p_0$ is dropped. For our model, this means that optimizing the Ising model using exactly computed gradients is impossible for moderately sized problems.

When $p=\prod_{i=1}^r p_i > 20$, we use a Monte-Carlo method to estimate the second moment \eqref{eq:ising-m2}, which is required to compute the partial gradients of the log-likelihood. Specifically, we use a Gibbs sampler to sample from the conditional distribution and approximate the second moment in an importance sampling framework. This can be implemented quite efficiently (see \cref{fig:ising-m2-perft} for a performance comparison with the exact computation), and the estimation accuracy of the second moment is evaluated experimentally. Simultaneously, we use the same approach to estimate the partition function. This, though, is inaccurate and may only be used to get a rough idea of the log-likelihood. Regardless, our method only requires the gradient for the optimization, and appropriate break conditions that are not based on the likelihood lead to a working procedure for maximum likelihood estimation.
\begin{figure}[!hpt]
    \centering
    \includegraphics[width=0.5\textwidth]{plots/sim-ising-perft-m2.pdf}
    \caption{\label{fig:ising-m2-perft}Performance test for computing/estimating the second moment of the Ising model of dimension $p$ using either the exact method or a Monte-Carlo (MC) simulation.}
\end{figure}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Chess}\label{sec:chess}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We provide a \emph{proof of concept} analysis of chess positions based on the Ising GMLM model, where we build a mixture of matrix Ising models. Our modeling is based on analyzed games and builds a sufficient reduction for winning positions.
The data set is provided by the \emph{lichess.org open database}.\footnote{\emph{lichess.org open database}, visited on December 8, 2023, \url{https://database.lichess.org}.} We randomly selected the November 2023 data, which consist of more than $92$ million games. We removed all games without position evaluations. The evaluations, also denoted as scores, are from Stockfish,\footnote{The Stockfish developers [see \href{https://github.com/official-stockfish/Stockfish/blob/master/AUTHORS}{AUTHORS} file] (since 2008). \textit{Stockfish}. Stockfish is a free and strong UCI chess engine. \url{https://stockfishchess.org}} a free and strong chess engine. The scores take the role of the response $Y$ and correspond to a winning probability from the white pieces' point of view. Positive scores are good for white, and negative scores indicate an advantage for the black pieces. We ignore all highly unbalanced positions, defined as positions with an absolute score above $5$. We also remove all positions with a mate score (one side can force checkmate). Furthermore, we only consider positions after $10$ half-moves to avoid oversampling the beginning of the most common openings, including the start position, which occurs in every game. Finally, we only consider positions with white to move. This leads to a final data set of roughly $64$ million positions, including duplicates. We did not filter according to the players' strength or the time control of the game. This means that our data set contains games from very weak to very strong players, as well as time controls ranging from very fast games, like Bullet, to classical chess.

A chess position is encoded as a set of $12$ binary matrices $\ten{X}_{\mathrm{piece}}$ of dimensions $8\times 8$. Every binary matrix encodes the positioning of a particular piece by containing a $1$ if the piece is present at the corresponding board position. The $12$ pieces derive from the $6$ types of pieces, namely pawns (\pawn), knights (\knight), bishops (\bishop), rooks (\rook), queens (\queen), and kings (\king), of two colors, black and white. See \cref{fig:fen2tensor} for a visualization.
\begin{figure}[!hpt]
    \centering
    \includegraphics[scale=0.8]{images/fen2tensor.pdf}
    \caption{\label{fig:fen2tensor}The chess start position and its 3D binary tensor representation; empty entries are $0$.}
\end{figure}

We assume that $\ten{X}_{\mathrm{piece}}\mid Y = y$ follows an Ising GMLM model as in \cref{sec:ising_estimation}, with the different conditional piece predictors being independent. This simplifying independence assumption results in a mixture model with log-likelihood
\begin{displaymath}
    l_n(\mat{\theta}) = \frac{1}{12}\sum_{\mathrm{piece}}l_n(\mat{\theta}_{\mathrm{piece}})
\end{displaymath}
where $l_n(\mat{\theta}_{\mathrm{piece}})$ is the Ising GMLM log-likelihood as in \cref{sec:ising_estimation} for $\ten{X}_{\mathrm{piece}}\mid Y = y$. For every component, the same relation to the scores $y$ is modeled via a $2\times 2$ matrix-valued function $\ten{F}_y$ consisting of the monomials $1, y, y^2$; specifically, $(\ten{F}_y)_{i j} = y^{i + j - 2}$.

Due to the volume of the data (millions of observations), it is computationally infeasible to compute the gradients on the entire data set. The dimension of the binary data is $12\times 8\times 8 = 768$ for every observation.
In our simplified mixture model, the pawns are a specific issue, as there are many millions of different configurations of the $8$ pawns per color on the $6\times 8$ sub-grid where pawns can be positioned. This alone does not allow us to take a reasonably sized subset for estimation. The solution is to switch from classic gradient-based optimization to a stochastic version; that is, every gradient update uses a new random subset of the entire data set. We draw independent random samples from the data set of $64$ million positions. The independence of the samples derives from the independence of the games; every sample is drawn from a different game.

\subsection{Validation}
Given the non-linear nature of the reduction, due to the quadratic matrix-valued function $\ten{F}_y$ of the score $y$, we use a \emph{generalized additive model}\footnote{using the function \texttt{gam()} from the \texttt{R} package \texttt{mgcv}.} (GAM) to predict position scores from reduced positions. The reduced positions are $48$-dimensional continuous vectors, obtained by combining the $12$ mixture components of the $2\times 2$ matrix-valued per-piece reductions. The per-piece reduction is
\begin{displaymath}
    \ten{R}(\ten{X}_{\mathrm{piece}}) = \mat{\beta}_{1,\mathrm{piece}}(\ten{X}_{\mathrm{piece}} - \E\ten{X}_{\mathrm{piece}})\t{\mat{\beta}_{2, \mathrm{piece}}}
\end{displaymath}
which gives the complete $48$-dimensional vectorized reduction by stacking the piece-wise reductions
\begin{displaymath}
    \vec{\ten{R}(\ten{X})} = (\vec{\ten{R}(\ten{X}_{\text{white pawn}})}, \ldots, \vec{\ten{R}(\ten{X}_{\text{black king}})}) = \t{\mat{B}}\vec(\ten{X} - \E\ten{X}).
\end{displaymath}
The second equality encodes all the piece-wise reductions in a block diagonal full reduction matrix $\mat{B}$ of dimension $768\times 48$, which is applied to the vectorization of the 3D tensor $\ten{X}$ that combines all the piece components $\ten{X}_{\mathrm{piece}}$ into a single tensor of dimension $8\times 8\times 12$. This is a reduction to $6.25\%$ of the original dimension. The $R^2$ statistic of the GAM fitted on $10^5$ new reduced samples is $R^2_{\mathrm{gam}}\approx 46\%$. A linear model on the reduced data achieves $R^2_{\mathrm{lm}}\approx 26\%$, which clearly shows the non-linear relation. On the other hand, the static evaluation of the \emph{Schach H\"ornchen}\footnote{Main author's chess engine.} engine, given the full position (\emph{not} reduced), achieves an $R^2_{\mathrm{hce}}\approx 52\%$. The $46\%$ of the GAM compares reasonably well with the $52\%$ of the engine's static evaluation, which receives the original position and uses chess-specific expert knowledge. Features included in the static evaluation that are expected to be learned by the GMLM mixture model are the \emph{material} (piece values) and \emph{piece square tables} (PSQT, preferred piece type positions). In addition, the static evaluation includes chess-specific features like \emph{king safety}, \emph{pawn structure}, or \emph{rooks on open files}. This lets us conclude that the reduction captures most of the relevant features possible, given the oversimplified modeling we performed.

\subsection{Interpretation}
For a compact interpretation of the estimated reduction we construct PSQTs. To do so, we use the linear model from the validation section and rewrite the combined linear reduction and linear model in terms of PSQTs.
Due to the nature of our analysis, it does not provide the usual PSQT, interpreted as indicating the squares where a piece is most powerful, but instead indicates the squares where the presence of a piece is indicative of a winning position. The two are different in the sense that the first indicates the potential of a piece on a particular square, while the second also incorporates the cases where this potential was already used, resulting in a winning position (often due to an oversight of the opponent).

Let $\mat{B}$ be the $768\times 48$ full vectorized linear reduction. This is the block diagonal matrix with the $64\times 4$ dimensional per-piece reductions $\mat{B}_{\mathrm{piece}} = \mat{\beta}^{\mathrm{piece}}_2\otimes\mat{\beta}^{\mathrm{piece}}_1$. Then, the linear model with coefficients $\mat{b}$ and intercept $a$ on the reduced data is given by
\begin{equation}\label{eq:chess-lm}
    y = a + \t{\mat{b}}\t{\mat{B}}\vec(\ten{X} - \E\ten{X}) + \epsilon
\end{equation}
with an unknown mean-zero error term $\epsilon$, treating the binary tensor $\ten{X}$ as continuous. Decomposing the linear model coefficients into blocks of $4$ gives per-piece coefficients $\mat{b}_{\mathrm{piece}}$, which interact only with the corresponding diagonal blocks $\mat{B}_{\mathrm{piece}}$ of $\mat{B}$. Rewriting \eqref{eq:chess-lm} gives
\begin{align*}
    y &= a + \sum_{\mathrm{piece}}\t{(\mat{B}_{\mathrm{piece}}\mat{b}_{\mathrm{piece}})}\vec(\ten{X}_{\mathrm{piece}} - \E\ten{X}_{\mathrm{piece}}) + \epsilon \\
    &= \tilde{a} + \sum_{\mathrm{piece}}\langle \mat{B}_{\mathrm{piece}}\mat{b}_{\mathrm{piece}}, \vec(\ten{X}_{\mathrm{piece}}) \rangle + \epsilon
\end{align*}
with a new intercept term $\tilde{a}$, which is of no interest to us. Finally, we enforce color symmetry, using known mechanisms from chess engines. Specifically, mirroring the position changes the sign of the score $y$. Here, mirroring reverses the rank (row) order; this corresponds to the image in a mirror placed behind the chess board. Let every $\mat{C}_{\mathrm{piece}}$ be an $8\times 8$ matrix with elements $(\mat{C}_{\mathrm{piece}})_{i j} = (\mat{B}_{\mathrm{piece}}\mat{b}_{\mathrm{piece}})_{i + 8 (j - 1)}$, and denote by $\mat{M}(\mat{A})$ the mirror operation, which reverses the row order of a matrix. The new notation allows us to enforce the symmetry, resulting in an approximate linear relationship:
\begin{align*}
    y &= \tilde{a} + \sum_{\mathrm{piece}}\langle \mat{C}_{\mathrm{piece}}, \ten{X}_{\mathrm{piece}} \rangle + \epsilon \\
    &\approx \tilde{a} + \sum_{\text{piece type}}\frac{1}{2}\langle \mat{C}_{\text{white piece}} - \mat{M}(\mat{C}_{\text{black piece}}), \ten{X}_{\text{white piece}} - \mat{M}(\ten{X}_{\text{black piece}}) \rangle + \epsilon
\end{align*}
If $\mat{C}_{\text{white piece}} = -\mat{M}(\mat{C}_{\text{black piece}})$ for every piece type, then we have equality (there are $6$ piece types, \emph{not} distinguishing between colors). In our case, this is approximately valid, as the estimates $\hat{\mat{C}}_{\mathrm{piece}}$ fulfill this property up to a small error. The $6$ matrices $(\mat{C}_{\text{white piece}} - \mat{M}(\mat{C}_{\text{black piece}})) / 2$ are called \emph{piece square tables} (PSQT), which are visualized in \cref{fig:psqt}. The interpretation of these tables is straightforward. A high positive value (blue) means that it is usually good to have a piece of the corresponding type on that square, while a high negative value (red) means the opposite.
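The following minimal \texttt{R} sketch illustrates this PSQT construction for a single piece type; the argument names are ours, with \texttt{B.white}, \texttt{B.black} standing for the estimated $64\times 4$ per-piece reduction blocks and \texttt{b.white}, \texttt{b.black} for the corresponding blocks of linear model coefficients.
\begin{verbatim}
## Color-symmetrized PSQT for one piece type (illustrative sketch).
## B.* are 64 x 4 reduction blocks, b.* the matching length-4 coefficients.
psqt <- function(B.white, b.white, B.black, b.black) {
  C.white <- matrix(B.white %*% b.white, nrow = 8, ncol = 8)
  C.black <- matrix(B.black %*% b.black, nrow = 8, ncol = 8)
  mirror  <- function(A) A[8:1, ]        # reverse the rank (row) order
  (C.white - mirror(C.black)) / 2        # color-symmetrized PSQT
}
\end{verbatim}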
It needs to be kept in mind that the PSQTs are for quiet positions only, which means all pieces are safe in the sense that there are no legal capturing moves and the king is not in check.
\begin{figure}[t]
    \centering
    \includegraphics[scale=0.8]{plots/psqt.pdf}
    \caption{\label{fig:psqt}Extracted PSQTs (piece square tables) from the chess example GMLM reduction.}
\end{figure}

The first visual effect in \cref{fig:psqt} is the dark blue PSQT of the Queen, followed by a less dark Rook PSQT. This indicates that the Queen, followed by the Rook, are the most valuable pieces (excluding the King, which is the most valuable but to which assigning a value makes no sense). The next two are the Knight and Bishop, which have a higher value than the Pawns, ignoring the $6$th and $7$th ranks, where pawns become potential Queens. This is the classic piece value order known in chess.

Next, going over the PSQTs one by one, we comment on the preferred positions for every piece type. Pawn positions are especially good on the $6$th and, above all, the $7$th rank, as this threatens a promotion to a Queen (or Knight, Bishop, Rook). The Knight PSQT is a bit surprising; the most likely explanation for the Knight being good in enemy territory is that it got there by capturing an enemy piece for free, a common occurrence in low-rated games, which make up a big chunk of the training data that ranges over all levels. The Bishops seem to have no specific preferred placement, only a slightly higher overall value than Pawns, excluding pawns about to promote. Continuing with the Rooks, we see that the Rook is a good attacking piece, as indicated by the value of a safe rook infiltration\footnote{Rook infiltration is a strategic concept in chess that involves skillfully maneuvering the rook to penetrate deep into the opponent's territory.}. It is well-known chess knowledge that the Rook is most powerful on the $7$th rank, but this differs from our analysis, as it refers to the potential of a Rook, in contrast to the position of a Rook in winning positions over all states of the game. The Queen is powerful almost everywhere; only the outer back-rank squares (lower left and right) tend to reduce her value. This is rooted in the Queen's presence there usually being a sign of being chased by enemy pieces, leading to many squares being controlled by the enemy and hindering one's own movement. Finally, given that the goal of the game is to checkmate the King, a safe position for the King is very valuable. This is seen in the back rank (rank $1$) containing the only non-penalized squares. Furthermore, the safest squares are the castling\footnote{Castling is a maneuver that combines king safety with rook activation.} target squares ($g1$ and $c1$) as well as the $b1$ square. Shifting the King over to $b1$ is quite common to protect the $a2$ pawn, so that the entire pawn shield in front of the King is protected.

The results of our analysis agree with the configurations of the chess board most associated with observed game outcomes. This analysis also aligns with human chess players' understanding of an average configuration at any moment during the game, once the different interpretation, indicating a winning position rather than the power of a piece on a specific square, is taken into account. Moreover, our analysis was performed over a wide range of player skill levels, while disregarding whether a position occurs in the middle game or the endgame.
\end{appendix} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% Support information, if any, %% %% should be provided in the %% %% Acknowledgements section. %% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %\begin{acks}[Acknowledgments] %The authors would like to thank the anonymous referees, an Associate Editor and the Editor for their constructive comments that improved the quality of this paper. %\end{acks} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% Funding information, if any, %% %% should be provided in the %% %% funding section. %% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % \begin{funding} % Both authors were partially supported by the Vienna Science and Technology Fund (WWTF) [10.47379/ICT19018] and the Austrian Science Fund (FWF) research % project P 30690-N35. % \end{funding} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% Supplementary Material, including data %% %% sets and code, should be provided in %% %% {supplement} environment with title %% %% and short description. It cannot be %% %% available exclusively as external link. %% %% All Supplementary Material must be %% %% available to the reader on Project %% %% Euclid with the published article. %% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % \begin{supplement} % \stitle{???} % \sdescription{???.} % \end{supplement} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% The Bibliography %% %% %% %% imsart-???.bst will be used to %% %% create a .BBL file for submission. %% %% %% %% Note that the displayed Bibliography will not %% %% necessarily be rendered by Latex exactly as specified %% %% in the online Instructions for Authors. %% %% %% %% MR numbers will be added by VTeX. %% %% %% %% Use \cite{...} to cite references in text. %% %% %% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \bibliographystyle{imsart-nameyear} % Style BST file (imsart-number.bst or imsart-nameyear.bst) \bibliography{references} % Bibliography file (usually '*.bib') \end{document}