update: AOS accepted manuscript to new IMS article class,
fix: chess example typos (i guess?!)
This commit is contained in:
parent
4beea0b12f
commit
e04014bcf0
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -65,7 +65,7 @@
|
|||||||
% Requires Latex2e, ver.2000.06
|
% Requires Latex2e, ver.2000.06
|
||||||
%
|
%
|
||||||
\def\imsfmt@name{imsart}
|
\def\imsfmt@name{imsart}
|
||||||
\def\imsfmt@version{2023/05/19}
|
\def\imsfmt@version{2025/07/23}
|
||||||
%
|
%
|
||||||
\ProvidesFile{imsart.sty}[\imsfmt@version\space IMS article style]
|
\ProvidesFile{imsart.sty}[\imsfmt@version\space IMS article style]
|
||||||
%
|
%
|
||||||
@ -172,6 +172,9 @@
|
|||||||
% for new thanksref syntax
|
% for new thanksref syntax
|
||||||
\newif\if@updated@thanksref \@updated@thanksreffalse
|
\newif\if@updated@thanksref \@updated@thanksreffalse
|
||||||
%
|
%
|
||||||
|
% error message if xr or xr-hyper packages are loaded
|
||||||
|
\newif\if@load@xr@or@xr@hyper \@load@xr@or@xr@hyperfalse
|
||||||
|
%
|
||||||
% Information about journals
|
% Information about journals
|
||||||
%
|
%
|
||||||
\def\set@generic{\def\@tempa{-generic}}
|
\def\set@generic{\def\@tempa{-generic}}
|
||||||
@ -195,6 +198,8 @@
|
|||||||
\def\journal@issn{ISSN: 1549-5787}%
|
\def\journal@issn{ISSN: 1549-5787}%
|
||||||
\def\journal@url{https://imstat.org/journals-and-publications/probability-surveys/}%
|
\def\journal@url{https://imstat.org/journals-and-publications/probability-surveys/}%
|
||||||
\@ejs@ps@ss@layouttrue
|
\@ejs@ps@ss@layouttrue
|
||||||
|
\@print@bid@doitrue
|
||||||
|
\@imsart@showframetrue
|
||||||
}
|
}
|
||||||
%
|
%
|
||||||
% Information about journals
|
% Information about journals
|
||||||
@ -204,6 +209,8 @@
|
|||||||
\def\journal@issn{ISSN: 1935-7516}%
|
\def\journal@issn{ISSN: 1935-7516}%
|
||||||
\def\journal@url{https://imstat.org/journals-and-publications/statistics-surveys/}%
|
\def\journal@url{https://imstat.org/journals-and-publications/statistics-surveys/}%
|
||||||
\@ejs@ps@ss@layouttrue
|
\@ejs@ps@ss@layouttrue
|
||||||
|
\@print@bid@doitrue
|
||||||
|
\@imsart@showframetrue
|
||||||
}
|
}
|
||||||
%
|
%
|
||||||
% Information about journals
|
% Information about journals
|
||||||
@ -213,6 +220,19 @@
|
|||||||
\def\journal@issn{ISSN: 1935-7524}%
|
\def\journal@issn{ISSN: 1935-7524}%
|
||||||
\def\journal@url{https://imstat.org/journals-and-publications/electronic-journal-of-statistics/}%
|
\def\journal@url{https://imstat.org/journals-and-publications/electronic-journal-of-statistics/}%
|
||||||
\@ejs@ps@ss@layouttrue
|
\@ejs@ps@ss@layouttrue
|
||||||
|
\@print@bid@doitrue
|
||||||
|
\@imsart@showframetrue
|
||||||
|
}
|
||||||
|
\newif\if@ejsvtwo@layout \@ejsvtwo@layoutfalse
|
||||||
|
\DeclareOption{ejsv2}{%
|
||||||
|
\def\journal@id{-ejs}%
|
||||||
|
\def\journal@name{Electronic Journal of Statistics }%
|
||||||
|
\def\journal@issn{ISSN: 1935-7524}%
|
||||||
|
\def\journal@url{https://imstat.org/journals-and-publications/electronic-journal-of-statistics/}%
|
||||||
|
\@ejs@ps@ss@layouttrue
|
||||||
|
\@print@bid@doitrue
|
||||||
|
\@imsart@showframetrue
|
||||||
|
\@ejsvtwo@layouttrue
|
||||||
}
|
}
|
||||||
%
|
%
|
||||||
% Options for the IMS journals:
|
% Options for the IMS journals:
|
||||||
@ -231,6 +251,8 @@
|
|||||||
\@autosecdottrue
|
\@autosecdottrue
|
||||||
\@print@bid@doitrue
|
\@print@bid@doitrue
|
||||||
\@updated@thanksreftrue
|
\@updated@thanksreftrue
|
||||||
|
\@load@amsthmtrue
|
||||||
|
\@load@xr@or@xr@hypertrue
|
||||||
}
|
}
|
||||||
%
|
%
|
||||||
\DeclareOption{aop}{%
|
\DeclareOption{aop}{%
|
||||||
@ -247,6 +269,8 @@
|
|||||||
\@autosecdottrue
|
\@autosecdottrue
|
||||||
\@print@bid@doitrue
|
\@print@bid@doitrue
|
||||||
\@updated@thanksreftrue
|
\@updated@thanksreftrue
|
||||||
|
\@load@amsthmtrue
|
||||||
|
\@load@xr@or@xr@hypertrue
|
||||||
}
|
}
|
||||||
%
|
%
|
||||||
\DeclareOption{aos}{%
|
\DeclareOption{aos}{%
|
||||||
@ -263,6 +287,8 @@
|
|||||||
\@autosecdottrue
|
\@autosecdottrue
|
||||||
\@print@bid@doitrue
|
\@print@bid@doitrue
|
||||||
\@updated@thanksreftrue
|
\@updated@thanksreftrue
|
||||||
|
\@load@amsthmtrue
|
||||||
|
\@load@xr@or@xr@hypertrue
|
||||||
}
|
}
|
||||||
%
|
%
|
||||||
\DeclareOption{aoas}{%
|
\DeclareOption{aoas}{%
|
||||||
@ -279,6 +305,8 @@
|
|||||||
\@autosecdottrue
|
\@autosecdottrue
|
||||||
\@print@bid@doitrue
|
\@print@bid@doitrue
|
||||||
\@updated@thanksreftrue
|
\@updated@thanksreftrue
|
||||||
|
\@load@amsthmtrue
|
||||||
|
\@load@xr@or@xr@hypertrue
|
||||||
\AtBeginDocument{%
|
\AtBeginDocument{%
|
||||||
\@ifpackagewith{natbib}{numbers}{%
|
\@ifpackagewith{natbib}{numbers}{%
|
||||||
\@latex@error{Only author-year citations are allowed!
|
\@latex@error{Only author-year citations are allowed!
|
||||||
@ -298,13 +326,15 @@
|
|||||||
\@stslayouttrue
|
\@stslayouttrue
|
||||||
\@twocolumntrue
|
\@twocolumntrue
|
||||||
\@print@bid@doitrue
|
\@print@bid@doitrue
|
||||||
|
\@load@amsthmtrue
|
||||||
|
\@load@xr@or@xr@hypertrue
|
||||||
\AtEndPreamble{\if@twocolumn\set@page@layout{528.62625pt}{702.75pt}\fi}%
|
\AtEndPreamble{\if@twocolumn\set@page@layout{528.62625pt}{702.75pt}\fi}%
|
||||||
}
|
}
|
||||||
%
|
%
|
||||||
\DeclareOption{bj}{%
|
\DeclareOption{bj}{%
|
||||||
\def\journal@id{-bj}%
|
\def\journal@id{-bj}%
|
||||||
\def\journal@name{Submitted to Bernoulli }%
|
\def\journal@name{Submitted to Bernoulli }%
|
||||||
\def\journal@url{http://www.bernoulli-society.org/index.php/publications/bernoulli-journal/bernoulli-journal}%
|
\def\journal@url{https://www.bernoullisociety.org/index.php/publications/bernoulli-journal/bernoulli-journal}%
|
||||||
\def\journal@issn{1350-7265}%
|
\def\journal@issn{1350-7265}%
|
||||||
\set@page@layout{34pc}{574pt}%
|
\set@page@layout{34pc}{574pt}%
|
||||||
\@twosidetrue
|
\@twosidetrue
|
||||||
@ -316,6 +346,7 @@
|
|||||||
\@load@amsmathtrue
|
\@load@amsmathtrue
|
||||||
\@load@natbibtrue
|
\@load@natbibtrue
|
||||||
\@imsart@showframetrue
|
\@imsart@showframetrue
|
||||||
|
\@load@xr@or@xr@hypertrue
|
||||||
}
|
}
|
||||||
%
|
%
|
||||||
\DeclareOption{aihp}{%
|
\DeclareOption{aihp}{%
|
||||||
@ -330,6 +361,8 @@
|
|||||||
\@aihplayouttrue
|
\@aihplayouttrue
|
||||||
\@updated@thanksreftrue
|
\@updated@thanksreftrue
|
||||||
\@print@bid@doitrue
|
\@print@bid@doitrue
|
||||||
|
\@load@amsthmtrue
|
||||||
|
\@load@xr@or@xr@hypertrue
|
||||||
\AtBeginDocument{%
|
\AtBeginDocument{%
|
||||||
\@ifpackagewith{natbib}{authoryear}{%
|
\@ifpackagewith{natbib}{authoryear}{%
|
||||||
\@latex@error{Only numerical citations are allowed!
|
\@latex@error{Only numerical citations are allowed!
|
||||||
@ -349,6 +382,8 @@
|
|||||||
\@bjpslayouttrue
|
\@bjpslayouttrue
|
||||||
\@updated@thanksreftrue
|
\@updated@thanksreftrue
|
||||||
\@print@bid@doitrue
|
\@print@bid@doitrue
|
||||||
|
\@load@amsthmtrue
|
||||||
|
\@load@xr@or@xr@hypertrue
|
||||||
\AtBeginDocument{%
|
\AtBeginDocument{%
|
||||||
\@ifpackagewith{natbib}{numbers}{%
|
\@ifpackagewith{natbib}{numbers}{%
|
||||||
\@latex@error{Only author-year citations are allowed!
|
\@latex@error{Only author-year citations are allowed!
|
||||||
@ -357,6 +392,11 @@
|
|||||||
}%
|
}%
|
||||||
}
|
}
|
||||||
%
|
%
|
||||||
|
%% Disable xr, xr-hyper error for use with supplement material
|
||||||
|
\DeclareOption{supplement}{%
|
||||||
|
\@load@xr@or@xr@hyperfalse
|
||||||
|
}
|
||||||
|
%
|
||||||
%% Layouts:
|
%% Layouts:
|
||||||
% IMS journals AAP, AOP, AOS layout:
|
% IMS journals AAP, AOP, AOS layout:
|
||||||
\newif\if@imslayout \@imslayoutfalse
|
\newif\if@imslayout \@imslayoutfalse
|
||||||
@ -400,6 +440,10 @@
|
|||||||
\DeclareOption{showframe}{\@imsart@showframetrue}
|
\DeclareOption{showframe}{\@imsart@showframetrue}
|
||||||
\DeclareOption{noshowframe}{\@imsart@showframefalse}
|
\DeclareOption{noshowframe}{\@imsart@showframefalse}
|
||||||
%
|
%
|
||||||
|
% twoside (to catch direct use)
|
||||||
|
\newif\if@imsart@twoside \@imsart@twosidefalse
|
||||||
|
\DeclareOption{twoside}{\global\@imsart@twosidetrue}
|
||||||
|
%
|
||||||
% Combined options:
|
% Combined options:
|
||||||
%
|
%
|
||||||
% Use this option for submission for pier review:
|
% Use this option for submission for pier review:
|
||||||
@ -472,7 +516,7 @@
|
|||||||
\addtolength\topmargin{-.5\topmargin}%
|
\addtolength\topmargin{-.5\topmargin}%
|
||||||
\@settopoint\topmargin
|
\@settopoint\topmargin
|
||||||
%
|
%
|
||||||
% check if tw and th are'nt altered
|
% check if tw and th aren't altered
|
||||||
\xdef\imsart@check@textwidth{\the\textwidth}%
|
\xdef\imsart@check@textwidth{\the\textwidth}%
|
||||||
\xdef\imsart@check@textheight{\the\textheight}%
|
\xdef\imsart@check@textheight{\the\textheight}%
|
||||||
\AtEndDocument{%
|
\AtEndDocument{%
|
||||||
@ -510,6 +554,19 @@
|
|||||||
\ExecuteOptions{}
|
\ExecuteOptions{}
|
||||||
\ProcessOptions
|
\ProcessOptions
|
||||||
%
|
%
|
||||||
|
\csname imsart@fileinfo\endcsname
|
||||||
|
\ifdefined\jobstart@date@info
|
||||||
|
\else
|
||||||
|
\edef\jobstart@date@info{\the\year\two@digits{\the\month}\two@digits{\the\day}}
|
||||||
|
\fi
|
||||||
|
%
|
||||||
|
\if@bjlayout
|
||||||
|
\if@imsart@nameyear
|
||||||
|
\else
|
||||||
|
\PassOptionsToPackage{sort&compress}{natbib}%
|
||||||
|
\fi
|
||||||
|
\fi
|
||||||
|
%
|
||||||
% dimensions
|
% dimensions
|
||||||
%
|
%
|
||||||
\setlength\parindent{12\p@}
|
\setlength\parindent{12\p@}
|
||||||
@ -527,6 +584,17 @@
|
|||||||
\setlength\medskipamount{12\p@ \@plus 3\p@ \@minus 3\p@}
|
\setlength\medskipamount{12\p@ \@plus 3\p@ \@minus 3\p@}
|
||||||
\setlength\bigskipamount{18\p@ \@plus 3\p@ \@minus 3\p@}
|
\setlength\bigskipamount{18\p@ \@plus 3\p@ \@minus 3\p@}
|
||||||
%
|
%
|
||||||
|
\if@imsart@twoside
|
||||||
|
\else
|
||||||
|
\if@ejs@ps@ss@layout
|
||||||
|
% it's twoside but centered
|
||||||
|
\advance\evensidemargin by \oddsidemargin
|
||||||
|
\divide\evensidemargin by \tw@
|
||||||
|
\oddsidemargin=\evensidemargin
|
||||||
|
\marginparwidth=65\p@% as in article with oneside
|
||||||
|
\fi
|
||||||
|
\fi
|
||||||
|
%
|
||||||
% setpkgattr, getpkgattr, do@option@list
|
% setpkgattr, getpkgattr, do@option@list
|
||||||
%
|
%
|
||||||
\def\sep@key@value#1=#2/?/#3{\setpkgattr{#3}{#1}{#2}}
|
\def\sep@key@value#1=#2/?/#3{\setpkgattr{#3}{#1}{#2}}
|
||||||
@ -1287,6 +1355,10 @@
|
|||||||
\setpkgattr{tablename}{size}{\scshape}
|
\setpkgattr{tablename}{size}{\scshape}
|
||||||
\setpkgattr{tablename}{skip}{\endgraf}
|
\setpkgattr{tablename}{skip}{\endgraf}
|
||||||
%
|
%
|
||||||
|
\if@ejs@ps@ss@layout
|
||||||
|
\setpkgattr{tablename}{size}{\scshape\upshape}
|
||||||
|
\fi
|
||||||
|
%
|
||||||
% figure : use \@makecaption:
|
% figure : use \@makecaption:
|
||||||
\renewcommand\figurename{Fig}
|
\renewcommand\figurename{Fig}
|
||||||
%
|
%
|
||||||
@ -1307,6 +1379,10 @@
|
|||||||
\setpkgattr{figurename}{size}{\scshape}
|
\setpkgattr{figurename}{size}{\scshape}
|
||||||
\setpkgattr{figurename}{skip}{.~}
|
\setpkgattr{figurename}{skip}{.~}
|
||||||
%
|
%
|
||||||
|
\if@ejs@ps@ss@layout
|
||||||
|
\setpkgattr{figurename}{size}{\scshape\upshape}
|
||||||
|
\fi
|
||||||
|
%
|
||||||
\def\@floatboxreset{%
|
\def\@floatboxreset{%
|
||||||
\reset@font
|
\reset@font
|
||||||
\@setminipage
|
\@setminipage
|
||||||
@ -1655,7 +1731,13 @@
|
|||||||
\setaftersec@dot{#5}%
|
\setaftersec@dot{#5}%
|
||||||
\def\@svsechd{#4{\hskip #1\relax #5\aftersec@dot}}%
|
\def\@svsechd{#4{\hskip #1\relax #5\aftersec@dot}}%
|
||||||
\fi
|
\fi
|
||||||
\ifnum\ssection@level=1\phantomsection\addcontentsline{toc}{section}{#5}\fi
|
\ifnum\ssection@level=1%
|
||||||
|
\ifdefined\HCode
|
||||||
|
\else
|
||||||
|
\phantomsection
|
||||||
|
\fi
|
||||||
|
\addcontentsline{toc}{section}{#5}%
|
||||||
|
\fi
|
||||||
\@xsect{#3}%
|
\@xsect{#3}%
|
||||||
}
|
}
|
||||||
%
|
%
|
||||||
@ -1771,6 +1853,9 @@
|
|||||||
}%
|
}%
|
||||||
}
|
}
|
||||||
\setpkgattr{copyright}{owner}{$\copyright$~\@copyrightyear \copyrightowner@text}
|
\setpkgattr{copyright}{owner}{$\copyright$~\@copyrightyear \copyrightowner@text}
|
||||||
|
\ifdefined\HCode
|
||||||
|
\def\copyright@owner@prefix{Copyright }
|
||||||
|
\fi
|
||||||
\setpkgattr{author}{prefix}{}
|
\setpkgattr{author}{prefix}{}
|
||||||
\setpkgattr{keyword}{postfix}{\unskip.}
|
\setpkgattr{keyword}{postfix}{\unskip.}
|
||||||
%
|
%
|
||||||
@ -2291,17 +2376,11 @@
|
|||||||
%
|
%
|
||||||
% \kwd[; ]{foo}
|
% \kwd[; ]{foo}
|
||||||
\def\sep{\unskip\string, }
|
\def\sep{\unskip\string, }
|
||||||
\newif\if@firstkeywordinlist \@firstkeywordinlisttrue
|
|
||||||
%
|
%
|
||||||
\DeclareRobustCommand*\kwd{\@ifnextchar[\@kwd{\@kwd[\kwd@sep]}}
|
\DeclareRobustCommand*\kwd{\@ifnextchar[\@kwd{\@kwd[\kwd@sep]}}
|
||||||
\def\@kwd[#1]#2{%
|
\def\@kwd[#1]#2{%
|
||||||
\unskip#1{#2}%
|
\unskip#1{#2}%
|
||||||
\if@firstkeywordinlist
|
\addto@keywords@list{#2}%
|
||||||
\addto@keywords@list{#2}%
|
|
||||||
\@firstkeywordinlistfalse
|
|
||||||
\else
|
|
||||||
\addto@keywords@list{, #2}%
|
|
||||||
\fi
|
|
||||||
\let\kwd@sep\sep
|
\let\kwd@sep\sep
|
||||||
}
|
}
|
||||||
%
|
%
|
||||||
@ -2533,10 +2612,60 @@
|
|||||||
\def\frenchspacing{\sfcode`\.1006\sfcode`\?1005\sfcode`\!1004%
|
\def\frenchspacing{\sfcode`\.1006\sfcode`\?1005\sfcode`\!1004%
|
||||||
\sfcode`\:1003\sfcode`\;1002\sfcode`\,1001 }%
|
\sfcode`\:1003\sfcode`\;1002\sfcode`\,1001 }%
|
||||||
}
|
}
|
||||||
|
% set newtx as main font
|
||||||
|
\def\set@imsart@newtx{%
|
||||||
|
\csname set@imsart@newtx@hook\endcsname
|
||||||
|
\RequirePackage[T1]{fontenc}%
|
||||||
|
\IfFileExists{nimbusserif.sty}%
|
||||||
|
{%
|
||||||
|
\RequirePackage{nimbusmono}%
|
||||||
|
\RequirePackage[scaled=0.9]{nimbussans}%
|
||||||
|
\ifnum\jobstart@date@info>20250531
|
||||||
|
\ifdefined\HCode
|
||||||
|
\else
|
||||||
|
\RequirePackage{nimbusserif}%
|
||||||
|
\AtEndOfClass{%
|
||||||
|
\DeclareFontShape{T1}{NimbusSerif}{m}{sc}{<-> utmrc8t}{}%
|
||||||
|
\DeclareFontShape{T1}{NimbusSerif}{b}{sc}{<-> utmbc8t}{}%
|
||||||
|
\DeclareFontShape{T1}{NimbusSerif}{bx}{sc}{<->ssub * NimbusSerif/b/sc}{}%
|
||||||
|
}%
|
||||||
|
\fi
|
||||||
|
\else
|
||||||
|
\def\rmdefault{utm}%
|
||||||
|
\def\familydefault{\rmdefault}%
|
||||||
|
\fi
|
||||||
|
}%
|
||||||
|
{%
|
||||||
|
\PackageWarning{imsart}{nimbusserif package isn't available,
|
||||||
|
falling back to default Times Roman equivalent font}%
|
||||||
|
\RequirePackage{times,courier}%
|
||||||
|
\RequirePackage[scaled=.9]{helvet}%
|
||||||
|
}%
|
||||||
|
\RequirePackage{textcomp}%
|
||||||
|
\ifdefined\HCode
|
||||||
|
\RequirePackage{amsthm,amsmath}%
|
||||||
|
\else
|
||||||
|
\IfFileExists{newtxmath.sty}%
|
||||||
|
{%
|
||||||
|
\ifdefined\DocumentMetadata% >=2022
|
||||||
|
\RequirePackage[amsthm]{newtxmath}%
|
||||||
|
\else
|
||||||
|
\RequirePackage{amsthm}%
|
||||||
|
\RequirePackage{newtxmath}%
|
||||||
|
\fi
|
||||||
|
}%
|
||||||
|
{%
|
||||||
|
\PackageWarning{imsart}{newtxmath package isn't available,
|
||||||
|
amsmath and mathptmx are used instead}%
|
||||||
|
\RequirePackage{amsthm,amsmath,mathptmx}%
|
||||||
|
}%
|
||||||
|
\fi
|
||||||
|
}
|
||||||
% Set up parameters for the EJS, PS, SS layout:
|
% Set up parameters for the EJS, PS, SS layout:
|
||||||
\if@ejs@ps@ss@layout
|
\if@ejs@ps@ss@layout
|
||||||
\RequirePackage{lmodern}
|
\RequirePackage{lmodern}
|
||||||
\usepackage[T1]{fontenc}
|
\usepackage[T1]{fontenc}
|
||||||
|
\csname UseLegacyTextSymbols\endcsname% for text bullets, etc.
|
||||||
\AtEndOfPackage{%
|
\AtEndOfPackage{%
|
||||||
\ifcsundef{do@not@scale@lmex}%
|
\ifcsundef{do@not@scale@lmex}%
|
||||||
{% make lmex10 scalable
|
{% make lmex10 scalable
|
||||||
@ -2548,11 +2677,69 @@
|
|||||||
{}%
|
{}%
|
||||||
}%
|
}%
|
||||||
\fi
|
\fi
|
||||||
|
\if@ejsvtwo@layout
|
||||||
|
\def\@xipt{11}
|
||||||
|
\def\@xivpt{14}
|
||||||
|
\def\@xvipt{16}
|
||||||
|
\def\@xviiipt{18}
|
||||||
|
\def\@xxpt{20}
|
||||||
|
\def\@xxivpt{24}
|
||||||
|
\set@imsart@newtx
|
||||||
|
\declaremathsizes@as@in@mathtime
|
||||||
|
\renewcommand\normalsize{%
|
||||||
|
\@setfontsize\normalsize\@xipt{14\p@ plus .5\p@ minus .5\p@}%
|
||||||
|
\abovedisplayskip 8\p@ \@plus3\p@ \@minus3\p@
|
||||||
|
\belowdisplayskip\abovedisplayskip
|
||||||
|
\abovedisplayshortskip\abovedisplayskip
|
||||||
|
\belowdisplayshortskip\abovedisplayskip
|
||||||
|
\let\@listi\@listI
|
||||||
|
}
|
||||||
|
\normalfont\normalsize
|
||||||
|
\renewcommand\small{%
|
||||||
|
\@setfontsize\small\@xpt{12.5pt plus .4pt minus .4pt}%
|
||||||
|
\abovedisplayskip 7\p@ \@plus2\p@ \@minus2\p@
|
||||||
|
\belowdisplayskip \abovedisplayskip
|
||||||
|
\abovedisplayshortskip\abovedisplayskip
|
||||||
|
\belowdisplayshortskip\abovedisplayskip
|
||||||
|
}
|
||||||
|
\renewcommand\footnotesize{%
|
||||||
|
\@setfontsize\footnotesize\@ixpt{11.5pt plus .3\p@ minus .3\p@}%
|
||||||
|
\abovedisplayskip 6.5\p@ \@plus2\p@ \@minus2\p@
|
||||||
|
\belowdisplayskip \abovedisplayskip
|
||||||
|
\abovedisplayshortskip\abovedisplayskip
|
||||||
|
\belowdisplayshortskip\abovedisplayskip
|
||||||
|
}
|
||||||
|
\renewcommand\scriptsize{\@setfontsize\scriptsize\@viiipt\@xpt}
|
||||||
|
\renewcommand\tiny{\@setfontsize\tiny\@vipt\@viipt}
|
||||||
|
\renewcommand\large{\@setfontsize\large\@xiipt{15}}
|
||||||
|
\renewcommand\Large{\@setfontsize\Large\@xivpt{17}}
|
||||||
|
\renewcommand\LARGE{\@setfontsize\LARGE\@xvipt{20}}
|
||||||
|
\renewcommand\huge{\@setfontsize\huge\@xviiipt{23}}
|
||||||
|
\renewcommand\Huge{\@setfontsize\Huge\@xxpt{25}}
|
||||||
|
%
|
||||||
|
\setlength{\paperwidth}{6.75in}
|
||||||
|
\setlength{\paperheight}{10in}
|
||||||
|
\set@page@layout{405bp}{626pt}% 45 lines
|
||||||
|
\advance\topmargin by -7.5\p@
|
||||||
|
\ifpdf
|
||||||
|
\ifdefined\pdfpagewidth
|
||||||
|
\pdfpagewidth=\paperwidth
|
||||||
|
\pdfpageheight=\paperheight
|
||||||
|
\else
|
||||||
|
\pagewidth=\paperwidth
|
||||||
|
\pageheight=\paperheight
|
||||||
|
\fi
|
||||||
|
\else
|
||||||
|
\AtBeginDvi{\special{papersize=\the\paperwidth, \the\paperheight}}
|
||||||
|
\fi
|
||||||
|
\setpkgattr{title}{skip}{30\p@}
|
||||||
|
\fi
|
||||||
%
|
%
|
||||||
% Set up parameters for the BA journal layout:
|
% Set up parameters for the BA journal layout:
|
||||||
\if@balayout
|
\if@balayout
|
||||||
\RequirePackage{lmodern}
|
\RequirePackage{lmodern}
|
||||||
\usepackage[T1]{fontenc}
|
\usepackage[T1]{fontenc}
|
||||||
|
\UseLegacyTextSymbols% for text bullets, etc.
|
||||||
% make lmex10 scalable
|
% make lmex10 scalable
|
||||||
\DeclareFontFamily{OMX}{lmex}{}
|
\DeclareFontFamily{OMX}{lmex}{}
|
||||||
\DeclareFontShape{OMX}{lmex}{m}{n}{%
|
\DeclareFontShape{OMX}{lmex}{m}{n}{%
|
||||||
@ -2711,7 +2898,7 @@
|
|||||||
\@namedef{specialsection*}{\section*}%
|
\@namedef{specialsection*}{\section*}%
|
||||||
\def\stitle@fmt#1{#1}%
|
\def\stitle@fmt#1{#1}%
|
||||||
% Supplement case for BA:
|
% Supplement case for BA:
|
||||||
% macroses: \stitle, \slink[doi], \sdatatype, \sfilename.
|
% macros: \stitle, \slink[doi], \sdatatype, \sfilename.
|
||||||
\def\stitlepost#1{\gdef\@stitlepost{#1}}%
|
\def\stitlepost#1{\gdef\@stitlepost{#1}}%
|
||||||
\stitlepost{\space}%
|
\stitlepost{\space}%
|
||||||
\def\slink@doi@fmt{%
|
\def\slink@doi@fmt{%
|
||||||
@ -3336,29 +3523,16 @@
|
|||||||
% Set up parameters for the BJ journal layout:
|
% Set up parameters for the BJ journal layout:
|
||||||
%
|
%
|
||||||
\if@bjlayout
|
\if@bjlayout
|
||||||
\IfFileExists{newtxmath.sty}%
|
\set@imsart@newtx
|
||||||
{\edef\new@tx@math@sty@exists{1}}%
|
\def\orig@amsthm@openbox{%
|
||||||
{}
|
\leavevmode
|
||||||
\RequirePackage[T1]{fontenc}
|
\hbox to.77778em{%
|
||||||
\def\rmdefault{utm}
|
\hfil\vrule
|
||||||
\ifdefined\new@tx@math@sty@exists
|
\vbox to.675em{\hrule width.6em\vfil\hrule}%
|
||||||
\def\ttdefault{ucr}
|
\vrule\hfil
|
||||||
\else
|
}%
|
||||||
\def\ttdefault{pcr}% ucr in old systems are somewhat buggy
|
}
|
||||||
\fi
|
\let\openbox\orig@amsthm@openbox
|
||||||
\RequirePackage[scaled=0.9]{helvet}
|
|
||||||
\RequirePackage{textcomp}
|
|
||||||
\ifdefined\HCode
|
|
||||||
\else
|
|
||||||
\ifdefined\new@tx@math@sty@exists
|
|
||||||
\RequirePackage[cmintegrals,bigdelims]{newtxmath}
|
|
||||||
\else
|
|
||||||
\PackageWarning{imsart}{newtxmath package isn't available,
|
|
||||||
amsmath is used instead}%
|
|
||||||
\RequirePackage{amsmath}
|
|
||||||
\fi
|
|
||||||
\fi
|
|
||||||
\let\openbox\relax
|
|
||||||
\RequirePackage{graphicx}%
|
\RequirePackage{graphicx}%
|
||||||
\RequirePackage{letterspace}%
|
\RequirePackage{letterspace}%
|
||||||
\def\setlstracking#1{\csdef{MT@letterspace}{#1}}%
|
\def\setlstracking#1{\csdef{MT@letterspace}{#1}}%
|
||||||
@ -3436,7 +3610,7 @@
|
|||||||
\belowdisplayskip=\abovedisplayskip
|
\belowdisplayskip=\abovedisplayskip
|
||||||
\belowdisplayshortskip=\abovedisplayshortskip
|
\belowdisplayshortskip=\abovedisplayshortskip
|
||||||
}
|
}
|
||||||
\normalsize
|
\normalfont\normalsize
|
||||||
\renewcommand\small{%
|
\renewcommand\small{%
|
||||||
\@setfontsize\small\@ixpt{11\p@ plus .2\p@ minus .2\p@}%
|
\@setfontsize\small\@ixpt{11\p@ plus .2\p@ minus .2\p@}%
|
||||||
\abovedisplayskip=7.5\p@ \@plus4\p@ \@minus1\p@
|
\abovedisplayskip=7.5\p@ \@plus4\p@ \@minus1\p@
|
||||||
@ -3533,24 +3707,19 @@
|
|||||||
}
|
}
|
||||||
%
|
%
|
||||||
\AtBeginDocument{%
|
\AtBeginDocument{%
|
||||||
\ifcsundef{jobstart@date@info}%
|
\def\author@sep@by@number#1{%
|
||||||
{\xdef\jobstart@date@info{\the\year\two@digits{\the\month}\two@digits{\the\day}}}%
|
\@tempcntb=\author@num
|
||||||
{}%
|
\advance\@tempcntb by \m@ne
|
||||||
\ifnum\jobstart@date@info>20211202\relax
|
\ifnum#1=\@tempcntb
|
||||||
\def\author@sep@by@number#1{%
|
\unskip\unkern\ \and
|
||||||
\@tempcntb=\author@num
|
\else
|
||||||
\advance\@tempcntb by \m@ne
|
\ifnum#1=\author@num
|
||||||
\ifnum#1=\@tempcntb
|
|
||||||
\unskip\unkern\ \and
|
|
||||||
\else
|
\else
|
||||||
\ifnum#1=\author@num
|
\unskip\unkern,
|
||||||
\else
|
|
||||||
\unskip\unkern,
|
|
||||||
\fi
|
|
||||||
\fi
|
\fi
|
||||||
\ignorespaces
|
\fi
|
||||||
}%
|
\ignorespaces
|
||||||
\fi
|
}%
|
||||||
}
|
}
|
||||||
% invoke \printhistory at end of document:
|
% invoke \printhistory at end of document:
|
||||||
\let\old@enddocument\enddocument
|
\let\old@enddocument\enddocument
|
||||||
@ -3965,16 +4134,19 @@
|
|||||||
\@ifundefined{hy@subject}{}{\pdfstringdef\@pdfsubject{\hy@subject}}%
|
\@ifundefined{hy@subject}{}{\pdfstringdef\@pdfsubject{\hy@subject}}%
|
||||||
\@ifundefined{hy@keywords}{}{\pdfstringdef\@pdfkeywords{\hy@keywords}}%
|
\@ifundefined{hy@keywords}{}{\pdfstringdef\@pdfkeywords{\hy@keywords}}%
|
||||||
%
|
%
|
||||||
\@ifundefined{user@hy@title}{}{\global\let\@pdftitle\user@hy@title}%
|
\@ifundefined{user@hy@title}{}{\pdfstringdef\@pdftitle\user@hy@title}%
|
||||||
\@ifundefined{user@hy@author}{}{\global\let\@pdfauthor\user@hy@author}%
|
\@ifundefined{user@hy@author}{}{\pdfstringdef\@pdfauthor\user@hy@author}%
|
||||||
\@ifundefined{user@hy@subject}{}{\global\let\@pdfsubject\user@hy@subject}%
|
\@ifundefined{user@hy@subject}{}{\pdfstringdef\@pdfsubject\user@hy@subject}%
|
||||||
\@ifundefined{user@hy@keywords}{}{\global\let\@pdfkeywords\user@hy@keywords}%
|
\@ifundefined{user@hy@keywords}{}{\pdfstringdef\@pdfkeywords\user@hy@keywords}%
|
||||||
%
|
%
|
||||||
% MR with hyperef
|
% MR with hyperref
|
||||||
\def\MR##1{\@MR##1 \relax{##1}}%
|
\def\MR##1{\@MR##1 \relax{##1}}%
|
||||||
%
|
%
|
||||||
\if@print@bid@doi
|
\if@print@bid@doi
|
||||||
\urlstyle{rm}%
|
\if@ejs@ps@ss@layout
|
||||||
|
\else
|
||||||
|
\urlstyle{rm}%
|
||||||
|
\fi
|
||||||
\let\ims@nolinkurl\nolinkurl
|
\let\ims@nolinkurl\nolinkurl
|
||||||
\fi
|
\fi
|
||||||
}
|
}
|
||||||
@ -3991,10 +4163,12 @@
|
|||||||
}
|
}
|
||||||
%
|
%
|
||||||
\newtoks\keywords@list
|
\newtoks\keywords@list
|
||||||
|
\let\keywords@list@sep\@empty
|
||||||
\def\addto@keywords@list#1{%
|
\def\addto@keywords@list#1{%
|
||||||
\begingroup
|
\begingroup
|
||||||
\no@harm
|
\no@harm
|
||||||
\xdef\@act{\global\noexpand\keywords@list{\the\keywords@list#1}}\@act
|
\xdef\@act{\global\noexpand\keywords@list{\the\keywords@list\keywords@list@sep#1}}\@act
|
||||||
|
\xdef\keywords@list@sep{, }%
|
||||||
\endgroup
|
\endgroup
|
||||||
}
|
}
|
||||||
%
|
%
|
||||||
@ -4390,15 +4564,6 @@
|
|||||||
{.}%
|
{.}%
|
||||||
{1em}%
|
{1em}%
|
||||||
{}%
|
{}%
|
||||||
\newtheoremstyle{remark}%
|
|
||||||
{\medskipamount}%
|
|
||||||
{\medskipamount}%
|
|
||||||
{\normalfont}%
|
|
||||||
{\parindent}%
|
|
||||||
{\scshape}%
|
|
||||||
{.}%
|
|
||||||
{1em}%
|
|
||||||
{}%
|
|
||||||
%
|
%
|
||||||
\renewenvironment{proof}[1][\proofname]%
|
\renewenvironment{proof}[1][\proofname]%
|
||||||
{%
|
{%
|
||||||
@ -4425,15 +4590,6 @@
|
|||||||
{0.5em}%
|
{0.5em}%
|
||||||
{\thmname{##1}\thmnumber{ ##2}\thmnote{ (##3)}}%
|
{\thmname{##1}\thmnumber{ ##2}\thmnote{ (##3)}}%
|
||||||
\newtheoremstyle{definition}%
|
\newtheoremstyle{definition}%
|
||||||
{\medskipamount}%
|
|
||||||
{\medskipamount}%
|
|
||||||
{\itshape}%
|
|
||||||
{\z@}%
|
|
||||||
{\bfseries}%
|
|
||||||
{.}%
|
|
||||||
{0.5em}%
|
|
||||||
{}%
|
|
||||||
\newtheoremstyle{remark}%
|
|
||||||
{\medskipamount}%
|
{\medskipamount}%
|
||||||
{\medskipamount}%
|
{\medskipamount}%
|
||||||
{\normalfont}%
|
{\normalfont}%
|
||||||
@ -4471,7 +4627,7 @@
|
|||||||
\@endpefalse
|
\@endpefalse
|
||||||
}%
|
}%
|
||||||
\if@bjpslayout
|
\if@bjpslayout
|
||||||
\newtheoremstyle{remark}%
|
\newtheoremstyle{definition}%
|
||||||
{\smallskipamount}%
|
{\smallskipamount}%
|
||||||
{\smallskipamount}%
|
{\smallskipamount}%
|
||||||
{\normalfont}%
|
{\normalfont}%
|
||||||
@ -4481,7 +4637,24 @@
|
|||||||
{.5em}%
|
{.5em}%
|
||||||
{}%
|
{}%
|
||||||
\fi
|
\fi
|
||||||
\else
|
\fi
|
||||||
|
\ifnum\ims@thmshape>\z@
|
||||||
|
\def\th@remark@changeto{%
|
||||||
|
\PackageError{amsthm}{change \string\theoremstyle{remark} to \string\theoremstyle{definition}}{}%
|
||||||
|
}%
|
||||||
|
\let\th@remark\undefined
|
||||||
|
\renewcommand{\theoremstyle}[1]{%
|
||||||
|
\@ifundefined{th@##1}%
|
||||||
|
{%
|
||||||
|
\thm@style{plain}%
|
||||||
|
\@ifundefined{th@##1@changeto}%
|
||||||
|
{\PackageError{amsthm}{Unknown theoremstyle `##1'}{}}%
|
||||||
|
{\csname th@##1@changeto\endcsname}%
|
||||||
|
}%
|
||||||
|
{%
|
||||||
|
\thm@style{##1}%
|
||||||
|
}%
|
||||||
|
}%
|
||||||
\fi
|
\fi
|
||||||
}
|
}
|
||||||
%
|
%
|
||||||
@ -4545,7 +4718,7 @@
|
|||||||
{%
|
{%
|
||||||
\vskip0.5\baselineskip
|
\vskip0.5\baselineskip
|
||||||
\small
|
\small
|
||||||
{\noindent\normalfont\sffamily\bfseries\acknowledgementsname}\par
|
{\noindent\normalfont\sffamily\bfseries #1}\par
|
||||||
\begingroup\parindent 0pt\parskip 0.5\baselineskip
|
\begingroup\parindent 0pt\parskip 0.5\baselineskip
|
||||||
}%
|
}%
|
||||||
{\endgroup}
|
{\endgroup}
|
||||||
@ -5019,6 +5192,7 @@
|
|||||||
%
|
%
|
||||||
% appendix
|
% appendix
|
||||||
%
|
%
|
||||||
|
\def\@appsectionstar@hook{}
|
||||||
\if@imslayout
|
\if@imslayout
|
||||||
%
|
%
|
||||||
% appendix mess up everything in this layout:
|
% appendix mess up everything in this layout:
|
||||||
@ -5042,7 +5216,8 @@
|
|||||||
\else
|
\else
|
||||||
\specialsection*{\appendixname: #1}%
|
\specialsection*{\appendixname: #1}%
|
||||||
\fi
|
\fi
|
||||||
\setcounter{section}{1}%
|
\stepcounter{section}%
|
||||||
|
\@appsectionstar@hook
|
||||||
}
|
}
|
||||||
%
|
%
|
||||||
% \section{} -> APPENDIX A
|
% \section{} -> APPENDIX A
|
||||||
@ -5074,7 +5249,8 @@
|
|||||||
%
|
%
|
||||||
\def\@appsectionstar*#1{%
|
\def\@appsectionstar*#1{%
|
||||||
\old@section*{#1}%
|
\old@section*{#1}%
|
||||||
\setcounter{section}{1}%
|
\stepcounter{section}%
|
||||||
|
\@appsectionstar@hook
|
||||||
}
|
}
|
||||||
%
|
%
|
||||||
\def\@appsectionnostar#1{%
|
\def\@appsectionnostar#1{%
|
||||||
@ -5085,6 +5261,11 @@
|
|||||||
\fi
|
\fi
|
||||||
}
|
}
|
||||||
\fi
|
\fi
|
||||||
|
\if@bjlayout
|
||||||
|
\if@imsart@seceqn
|
||||||
|
\def\@appsectionstar@hook{\setcounter{equation}{0}}
|
||||||
|
\fi
|
||||||
|
\fi
|
||||||
%
|
%
|
||||||
% supplement
|
% supplement
|
||||||
%
|
%
|
||||||
@ -5295,7 +5476,10 @@
|
|||||||
\definecolor{imsgray}{gray}{0.5}%
|
\definecolor{imsgray}{gray}{0.5}%
|
||||||
}%
|
}%
|
||||||
%
|
%
|
||||||
\overfullrule5\p@
|
\if@ejs@ps@ss@layout
|
||||||
|
\else
|
||||||
|
\overfullrule=5\p@
|
||||||
|
\fi
|
||||||
\setpkgattr{showframe}{rulecolor}{imsgray}
|
\setpkgattr{showframe}{rulecolor}{imsgray}
|
||||||
\setpkgattr{showframe}{rulewidth}{.05pt}
|
\setpkgattr{showframe}{rulewidth}{.05pt}
|
||||||
%
|
%
|
||||||
@ -5312,14 +5496,12 @@
|
|||||||
\def\set@showframe@box{%
|
\def\set@showframe@box{%
|
||||||
\global\setbox\showframe@box\vbox to\z@{{%
|
\global\setbox\showframe@box\vbox to\z@{{%
|
||||||
\offinterlineskip
|
\offinterlineskip
|
||||||
\normalcolor
|
|
||||||
\vbox to \z@{\vss\textcolor{\showframe@rulecolor}{\hrule height\showframe@rulewidth width\textwidth}}%
|
\vbox to \z@{\vss\textcolor{\showframe@rulecolor}{\hrule height\showframe@rulewidth width\textwidth}}%
|
||||||
\vskip \headheight
|
\vskip \headheight
|
||||||
\vbox to \z@{\vss\textcolor{\showframe@rulecolor}{\hrule height\showframe@rulewidth width\textwidth}}%
|
\vbox to \z@{\vss\textcolor{\showframe@rulecolor}{\hrule height\showframe@rulewidth width\textwidth}}%
|
||||||
\vskip \headsep
|
\vskip \headsep
|
||||||
\vbox to \z@{\vss\textcolor{\showframe@rulecolor}{\hrule height\showframe@rulewidth width\textwidth}}%
|
\vbox to \z@{\vss\textcolor{\showframe@rulecolor}{\hrule height\showframe@rulewidth width\textwidth}}%
|
||||||
\hbox to \textwidth%
|
\hbox to \textwidth{%
|
||||||
{%
|
|
||||||
\llap{\textcolor{\showframe@rulecolor}{\vrule height\textheight width\showframe@rulewidth}}%
|
\llap{\textcolor{\showframe@rulecolor}{\vrule height\textheight width\showframe@rulewidth}}%
|
||||||
\hfil\textcolor{\showframe@rulecolor}{\vrule height\textheight width\showframe@rulewidth}%
|
\hfil\textcolor{\showframe@rulecolor}{\vrule height\textheight width\showframe@rulewidth}%
|
||||||
}%
|
}%
|
||||||
@ -5336,6 +5518,30 @@
|
|||||||
%
|
%
|
||||||
\fi
|
\fi
|
||||||
%
|
%
|
||||||
|
\if@load@xr@or@xr@hyper
|
||||||
|
\def\@load@xr@or@xr@hyper@error@message@text{%
|
||||||
|
Please do not use the 'xr' or 'xr-hyper' package.^^J%
|
||||||
|
External references are not supported in our workflow.^^J%
|
||||||
|
Replace all references to external documents with plain text%
|
||||||
|
}
|
||||||
|
\AtBeginDocument{%
|
||||||
|
\@ifpackageloaded{xr}%
|
||||||
|
{%
|
||||||
|
\@latex@error{%
|
||||||
|
\@load@xr@or@xr@hyper@error@message@text
|
||||||
|
}{}%
|
||||||
|
}%
|
||||||
|
{}%
|
||||||
|
\@ifpackageloaded{xr-hyper}%
|
||||||
|
{%
|
||||||
|
\@latex@error{%
|
||||||
|
\@load@xr@or@xr@hyper@error@message@text
|
||||||
|
}{}%
|
||||||
|
}%
|
||||||
|
{}%
|
||||||
|
}
|
||||||
|
\fi
|
||||||
|
%
|
||||||
% INITIALIZATION
|
% INITIALIZATION
|
||||||
%
|
%
|
||||||
% Read local configuration file (if exist):
|
% Read local configuration file (if exist):
|
||||||
|
|||||||
@ -2,16 +2,20 @@
|
|||||||
|
|
||||||
%% Packages
|
%% Packages
|
||||||
\usepackage[utf8]{inputenc}
|
\usepackage[utf8]{inputenc}
|
||||||
\usepackage[LSF, T1]{fontenc}
|
\usepackage[T1]{fontenc}
|
||||||
% \usepackage{lmodern} % TODO: interfers with "imsart" classed author address display
|
\RequirePackage{amsthm,amsmath,amsfonts,amssymb}
|
||||||
\usepackage{amsthm, amsmath, amsfonts, amssymb, bm, pifont}
|
\usepackage{bm,pifont}
|
||||||
\usepackage{float}
|
\usepackage{float}
|
||||||
\usepackage{chessfss}
|
% \usepackage{chessfss}
|
||||||
\usepackage{scalerel}
|
\usepackage{scalerel}
|
||||||
|
\RequirePackage[authoryear]{natbib} % author-year citations
|
||||||
|
\RequirePackage[
|
||||||
|
colorlinks,
|
||||||
|
citecolor=blue,
|
||||||
|
urlcolor=blue
|
||||||
|
]{hyperref} % for coloring bibliography citations and linked URLs
|
||||||
|
\RequirePackage{graphicx} % for including figures
|
||||||
\usepackage[dvipsnames]{xcolor}
|
\usepackage[dvipsnames]{xcolor}
|
||||||
\usepackage{graphicx}
|
|
||||||
\usepackage[authoryear]{natbib}
|
|
||||||
\usepackage[colorlinks, citecolor = blue, urlcolor = blue]{hyperref}
|
|
||||||
\usepackage[noabbrev, capitalize, nameinlink]{cleveref} % after hyperref
|
\usepackage[noabbrev, capitalize, nameinlink]{cleveref} % after hyperref
|
||||||
|
|
||||||
|
|
||||||
@ -23,6 +27,7 @@
|
|||||||
%% %%
|
%% %%
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
%\numberwithin{equation}{section}
|
%\numberwithin{equation}{section}
|
||||||
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
%% %%
|
%% %%
|
||||||
%% For Axiom, Claim, Corollary, Hypothesis, %%
|
%% For Axiom, Claim, Corollary, Hypothesis, %%
|
||||||
%% Lemma, Theorem, Proposition %%
|
%% Lemma, Theorem, Proposition %%
|
||||||
@ -30,10 +35,10 @@
|
|||||||
%% %%
|
%% %%
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
\theoremstyle{plain}
|
\theoremstyle{plain}
|
||||||
\newtheorem{theorem}{Theorem}
|
% \newtheorem{axiom}{Axiom}
|
||||||
\newtheorem{lemma}{Lemma}
|
% \newtheorem{claim}[axiom]{Claim}
|
||||||
\newtheorem{corollary}{Corollary}
|
\newtheorem{theorem}{Theorem}[section]
|
||||||
\newtheorem{proposition}{Proposition}
|
% \newtheorem{lemma}[theorem]{Lemma}
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
%% %%
|
%% %%
|
||||||
%% For Assumption, Definition, Example, %%
|
%% For Assumption, Definition, Example, %%
|
||||||
@ -41,23 +46,18 @@
|
|||||||
%% use \theoremstyle{remark} %%
|
%% use \theoremstyle{remark} %%
|
||||||
%% %%
|
%% %%
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
\theoremstyle{remark}
|
\theoremstyle{definition}
|
||||||
\newtheorem{definition}{Definition}
|
\newtheorem{definition}[theorem]{Definition}
|
||||||
\newtheorem{condition}{Condition}
|
% \newtheorem{condition}{Condition}
|
||||||
\newtheorem{example}{Example}
|
\newtheorem*{example}{Example}
|
||||||
\newtheorem{remark}{Remark}
|
\newtheorem*{remark}{Remark}
|
||||||
|
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
%% Please put your definitions here: %%
|
%% Our custom definitions: %%
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
|
|
||||||
%% Clever ref additional reference name
|
% typesetting of matrices, tensors and manifolds
|
||||||
\crefname{condition}{Condition}{Conditions}
|
|
||||||
\Crefname{condition}{Condition}{Conditions}
|
|
||||||
\crefrangelabelformat{condition}{#3#1#4-#5#2#6}
|
|
||||||
|
|
||||||
% matrices
|
|
||||||
\newcommand*{\mat}[1]{\boldsymbol{#1}}
|
\newcommand*{\mat}[1]{\boldsymbol{#1}}
|
||||||
% tensors (special case for lower case caligraphic letters)
|
|
||||||
\newcommand*{\ten}[1]{
|
\newcommand*{\ten}[1]{
|
||||||
\ifnum\pdfstrcmp{#1}{`}=1 % lowercase argument
|
\ifnum\pdfstrcmp{#1}{`}=1 % lowercase argument
|
||||||
\mathfrak{#1}
|
\mathfrak{#1}
|
||||||
@ -74,18 +74,15 @@
|
|||||||
\newcommand*{\rank}{\operatorname{rank}}
|
\newcommand*{\rank}{\operatorname{rank}}
|
||||||
\newcommand*{\diag}{\operatorname{diag}}
|
\newcommand*{\diag}{\operatorname{diag}}
|
||||||
\DeclareMathOperator{\tr}{tr}
|
\DeclareMathOperator{\tr}{tr}
|
||||||
\DeclareMathOperator{\var}{Var}
|
|
||||||
\DeclareMathOperator{\cov}{Cov}
|
\DeclareMathOperator{\cov}{Cov}
|
||||||
\DeclareMathOperator{\Span}{span}
|
\DeclareMathOperator{\Span}{span}
|
||||||
\DeclareMathOperator{\E}{\operatorname{\mathbb{E}}}
|
\DeclareMathOperator{\E}{\mathbb{E}}
|
||||||
\DeclareMathOperator*{\argmax}{{arg\,max}}
|
\DeclareMathOperator*{\argmax}{{arg\,max}}
|
||||||
\newcommand*{\D}{\textnormal{D}} % derivative
|
\newcommand*{\D}{\textnormal{D}} % derivative
|
||||||
\renewcommand*{\H}{\textnormal{H}} % hessian
|
\renewcommand*{\H}{\textnormal{H}} % hessian
|
||||||
\renewcommand*{\d}{\textnormal{d}} % differential
|
\renewcommand*{\d}{\textnormal{d}} % differential
|
||||||
\renewcommand*{\t}[1]{{#1^{T}}} % matrix transpose
|
\renewcommand*{\t}[1]{{#1^{T}}} % matrix transpose
|
||||||
\newcommand*{\pinv}[1]{{#1^{\dagger}}} % `Moore-Penrose pseudoinverse`
|
\newcommand*{\pinv}[1]{{#1^{\dagger}}} % `Moore-Penrose pseudoinverse`
|
||||||
% rearrangement operator, generalization of Van-Loan and Pitzianis rearrangement operation
|
|
||||||
\newcommand*{\K}{\mathcal{K}}
|
|
||||||
|
|
||||||
\renewcommand{\checkmark}{{\color{Green}\ding{51}}}
|
\renewcommand{\checkmark}{{\color{Green}\ding{51}}}
|
||||||
\newcommand{\xmark}{{\color{Red!70}\ding{55}}}
|
\newcommand{\xmark}{{\color{Red!70}\ding{55}}}
|
||||||
@ -158,13 +155,6 @@
|
|||||||
}
|
}
|
||||||
\makeatother
|
\makeatother
|
||||||
|
|
||||||
%%% "Fix" additional spacing around \left(...\right),
|
|
||||||
% see: https://tex.stackexchange.com/questions/2607/spacing-around-left-and-right
|
|
||||||
\let\originalleft\left
|
|
||||||
\let\originalright\right
|
|
||||||
\renewcommand{\left}{\mathopen{}\mathclose\bgroup\originalleft}
|
|
||||||
\renewcommand{\right}{\aftergroup\egroup\originalright}
|
|
||||||
|
|
||||||
\endlocaldefs
|
\endlocaldefs
|
||||||
|
|
||||||
\begin{document}
|
\begin{document}
|
||||||
@ -262,7 +252,7 @@ Even though our motivation is rooted in the SDR perspective, our inverse regress
|
|||||||
|
|
||||||
The structure of this paper is as follows. We begin by introducing notation in \cref{sec:notation}, followed by a formal definition of the problem in \Cref{sec:problem-formulation}. The proposed model is specified in \cref{sec:gmlm-model}.
|
The structure of this paper is as follows. We begin by introducing notation in \cref{sec:notation}, followed by a formal definition of the problem in \Cref{sec:problem-formulation}. The proposed model is specified in \cref{sec:gmlm-model}.
|
||||||
\Cref{sec:manifolds} provides a brief introduction to manifolds, serving as the basis for the consistency and asymptotic normality results detailed in \cref{sec:statprop}. A general maximum likelihood estimation procedure is presented and we derive specialized methods for the multi-linear normal and multi-linear Ising distributions in \cref{sec:ml-estimation}.
|
\Cref{sec:manifolds} provides a brief introduction to manifolds, serving as the basis for the consistency and asymptotic normality results detailed in \cref{sec:statprop}. A general maximum likelihood estimation procedure is presented and we derive specialized methods for the multi-linear normal and multi-linear Ising distributions in \cref{sec:ml-estimation}.
|
||||||
Simulations for continuous and binary tensor-valued predictors are carried out in \cref{sec:simulations}. We apply our model to EEG data, where the predictor takes the form of two- and three-dimensional arrays, as presented in \cref{sec:data-analysis}.
|
Simulations for continuous and binary tensor-valued predictors are carried out in \cref{sec:simulations}. We apply our model to EEG data, where the predictor takes the form of two- and three-dimensional arrays, in \cref{sec:data-analysis}.
|
||||||
Finally, we summarize our contributions and highlight potential directions for future research in \cref{sec:discussion}.
|
Finally, we summarize our contributions and highlight potential directions for future research in \cref{sec:discussion}.
|
||||||
|
|
||||||
|
|
||||||
@ -369,7 +359,7 @@ Writing the density in \eqref{eq:quad-density} in terms of the new parameters $\
|
|||||||
where $\mat{\eta}_y = \mat{\eta}_y(\overline{\ten{\eta}}$, $\mat{\beta}_1$, $\ldots$, $\mat{\beta}_r$, $\mat{\Omega}_1$, $\ldots$, $\mat{\Omega}_r)$ is a well defined function.
|
where $\mat{\eta}_y = \mat{\eta}_y(\overline{\ten{\eta}}$, $\mat{\beta}_1$, $\ldots$, $\mat{\beta}_r$, $\mat{\Omega}_1$, $\ldots$, $\mat{\Omega}_r)$ is a well defined function.
|
||||||
The density of $\ten{X}$ given $Y$ in \eqref{eq:gmlm-density} is now indexed by these new parameters and sets the problem in the framework of generalized linear modeling based on a mode-wise linear relation between the predictors $\ten{X}$ and the response $Y$, which we call the \emph{Generalized Multi-Linear Model} (GMLM). Under the GMLM inverse regression model, a sufficient reduction for the forward regression of $Y$ on $\ten{X}$ is given in \cref{thm:sdr}.
|
The density of $\ten{X}$ given $Y$ in \eqref{eq:gmlm-density} is now indexed by these new parameters and sets the problem in the framework of generalized linear modeling based on a mode-wise linear relation between the predictors $\ten{X}$ and the response $Y$, which we call the \emph{Generalized Multi-Linear Model} (GMLM). Under the GMLM inverse regression model, a sufficient reduction for the forward regression of $Y$ on $\ten{X}$ is given in \cref{thm:sdr}.
|
||||||
|
|
||||||
\begin{theorem}[\hyperlink{proof:sdr}{SDR}]\label{thm:sdr}
|
\begin{theorem}[SDR]\label{thm:sdr}
|
||||||
A sufficient reduction for the regression $Y\mid \ten{X}$ under the quadratic exponential family inverse regression model \eqref{eq:gmlm-density} is
|
A sufficient reduction for the regression $Y\mid \ten{X}$ under the quadratic exponential family inverse regression model \eqref{eq:gmlm-density} is
|
||||||
\begin{equation}\label{eq:sdr}
|
\begin{equation}\label{eq:sdr}
|
||||||
\ten{R}(\ten{X}) = (\ten{X} - \E\ten{X})\mlm_{j = 1}^{r}\t{\mat{\beta}_j}.
|
\ten{R}(\ten{X}) = (\ten{X} - \E\ten{X})\mlm_{j = 1}^{r}\t{\mat{\beta}_j}.
|
||||||
@ -377,7 +367,7 @@ The density of $\ten{X}$ given $Y$ in \eqref{eq:gmlm-density} is now indexed by
|
|||||||
The reduction \eqref{eq:sdr} is minimal if all $\mat{\beta}_j$ are full rank for $j=1,\ldots,r$.
|
The reduction \eqref{eq:sdr} is minimal if all $\mat{\beta}_j$ are full rank for $j=1,\ldots,r$.
|
||||||
\end{theorem}
|
\end{theorem}
|
||||||
|
|
||||||
The reduction \eqref{eq:sdr} in vectorized form is $\vec\ten{R}(\ten{X})=\t{\mat{B}}\vec(\ten{X} - \E\ten{X})$. \cref{thm:sdr} shows that the \emph{sufficient reduction} $\ten{R}(\ten{X})$ reduces $\ten{X}$ along each mode (dimension) linearly. The graph in \cref{fig:SDRvisual} is a visual representation of the sufficient reduction for a $3$-dimensional tensor-valued predictor. We provide the simplified version of the GMLM model and the sufficient reduction in \crefrange{ex:vector-valued}{ex:matrix-valued} for the special cases of vector- and matrix-valued predictors.
|
The reduction \eqref{eq:sdr} in vectorized form is $\vec\ten{R}(\ten{X})=\t{\mat{B}}\vec(\ten{X} - \E\ten{X})$. \cref{thm:sdr} shows that the \emph{sufficient reduction} $\ten{R}(\ten{X})$ reduces $\ten{X}$ along each mode (dimension) linearly. The graph in \cref{fig:SDRvisual} is a visual representation of the sufficient reduction for a $3$-dimensional tensor-valued predictor. We provide the simplified version of the GMLM model and the sufficient reduction for the special cases of vector- and matrix-valued predictors in the following examples.
|
||||||
|
|
||||||
\begin{figure}[!hpt]
|
\begin{figure}[!hpt]
|
||||||
\centering
|
\centering
|
||||||
@ -385,7 +375,7 @@ The reduction \eqref{eq:sdr} in vectorized form is $\vec\ten{R}(\ten{X})=\t{\mat
|
|||||||
\caption{\label{fig:SDRvisual}Visual depiction of the sufficient reduction in \cref{thm:sdr}.}
|
\caption{\label{fig:SDRvisual}Visual depiction of the sufficient reduction in \cref{thm:sdr}.}
|
||||||
\end{figure}
|
\end{figure}
|
||||||
|
|
||||||
\begin{example}[Vector valued $\mat{x}$ ($r = 1$)]\label{ex:vector-valued}
|
\begin{example}[Vector valued $\mat{x}$ ($r = 1$)]
|
||||||
For a vector-valued predictor $\mat{X}\in\mathbb{R}^{p_1}$, the density \eqref{eq:gmlm-density} reduces to
|
For a vector-valued predictor $\mat{X}\in\mathbb{R}^{p_1}$, the density \eqref{eq:gmlm-density} reduces to
|
||||||
\begin{align*}
|
\begin{align*}
|
||||||
f(\mat{x}\mid Y = y) &= h(\mat{x})\exp(\langle\mat{x}, \overline{\mat{\eta}} + \mat{\beta}_1\mat{f}_y\rangle + c\langle\mat{x}, \mat{\Omega}_1\mat{x}\rangle - b(\mat{\eta}_y)) \\
|
f(\mat{x}\mid Y = y) &= h(\mat{x})\exp(\langle\mat{x}, \overline{\mat{\eta}} + \mat{\beta}_1\mat{f}_y\rangle + c\langle\mat{x}, \mat{\Omega}_1\mat{x}\rangle - b(\mat{\eta}_y)) \\
|
||||||
@ -394,7 +384,7 @@ The reduction \eqref{eq:sdr} in vectorized form is $\vec\ten{R}(\ten{X})=\t{\mat
|
|||||||
where $\mat{f}_y\in\mathbb{R}^{q_1}$ is vector-valued as well. The sufficient reduction obtained by \cref{thm:sdr} is then $\mat{R}(\mat{x}) = \t{\mat{\beta}_1}(\mat{x} - \E\mat{X})\in\mathbb{R}^{q_1}$ and $\mat{B} = \mat{\beta}_1\in\mathbb{R}^{p_1\times q_1}$.
|
where $\mat{f}_y\in\mathbb{R}^{q_1}$ is vector-valued as well. The sufficient reduction obtained by \cref{thm:sdr} is then $\mat{R}(\mat{x}) = \t{\mat{\beta}_1}(\mat{x} - \E\mat{X})\in\mathbb{R}^{q_1}$ and $\mat{B} = \mat{\beta}_1\in\mathbb{R}^{p_1\times q_1}$.
|
||||||
\end{example}
|
\end{example}
|
||||||
|
|
||||||
\begin{example}[Matrix-valued $\mat{X}$ ($r = 2$)]\label{ex:matrix-valued}
|
\begin{example}[Matrix-valued $\mat{X}$ ($r = 2$)]
|
||||||
Assuming $\mat{X}$ is matrix-valued, which requires $\mat{F}_Y\in\mathbb{R}^{q_1\times q_2}$ to also be matrix-valued. Then, the density \eqref{eq:gmlm-density} has the form
|
Assuming $\mat{X}$ is matrix-valued, which requires $\mat{F}_Y\in\mathbb{R}^{q_1\times q_2}$ to also be matrix-valued. Then, the density \eqref{eq:gmlm-density} has the form
|
||||||
\begin{align*}
|
\begin{align*}
|
||||||
f(\mat{x}\mid Y = y)
|
f(\mat{x}\mid Y = y)
|
||||||
@ -409,13 +399,13 @@ The reduction \eqref{eq:sdr} in vectorized form is $\vec\ten{R}(\ten{X})=\t{\mat
|
|||||||
\section{Manifolds and Parameter Spaces}\label{sec:manifolds}
|
\section{Manifolds and Parameter Spaces}\label{sec:manifolds}
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
|
|
||||||
\cref{thm:sdr} finds the sufficient reduction for the regression of $Y$ on $\ten{X}$ in the population under the inverse GMLM \eqref{eq:gmlm-density}. In practice we need to estimate the mode-wise reduction matrices $\mat{\beta}_j$ in the GMLM \eqref{eq:gmlm-density}. As we operate within the framework of the exponential family, we opt for maximum likelihood estimation (MLE). In a classic generalized linear model setting, this is straightforward and yields well defined MLEs. In our setting, though, a major problem is due to the fact that our GMLM parameterization in \eqref{eq:gmlm-density} is \emph{not} identifiable. This is a direct consequence of the identity $\mat{\beta}_2\otimes\mat{\beta}_1 = (a\mat{\beta}_2)\otimes (a^{-1}\mat{\beta}_1)$ for any $a\neq 0$ (the same holds for the $\mat{\Omega}_j$s) so that different parameterizations of \eqref{eq:gmlm-density} describe the same density. In other words, we do \emph{not} have a one-to-one relation between the parameters and the GMLM and consistency cannot be established. Without consistency, derivation of the asymptotic distribution for the maximum likelihood estimator becomes infeasible.
|
\cref{thm:sdr} finds the sufficient reduction for the regression of $Y$ on $\ten{X}$ in the population under the inverse GMLM \eqref{eq:gmlm-density}. In practice, we need to estimate the mode-wise reduction matrices $\mat{\beta}_j$ in the GMLM \eqref{eq:gmlm-density}. As we operate within the framework of the exponential family, we opt for maximum likelihood estimation (MLE). In a classic generalized linear model setting, this is straightforward and yields well defined MLEs. In our setting, though, a major problem is due to the fact that our GMLM parameterization in \eqref{eq:gmlm-density} is \emph{not} identifiable. This is a direct consequence of the identity $\mat{\beta}_2\otimes\mat{\beta}_1 = (a\mat{\beta}_2)\otimes (a^{-1}\mat{\beta}_1)$ for any $a\neq 0$ (the same holds for the $\mat{\Omega}_j$s) so that different parameterizations of \eqref{eq:gmlm-density} describe the same density. In other words, we do \emph{not} have a one-to-one relation between the parameters and the GMLM and consistency cannot be established. Without consistency, derivation of the asymptotic distribution for the maximum likelihood estimator becomes infeasible.
|
||||||
|
|
||||||
To resolve this issue, we need to disambiguate the GMLM parameters to reestablish a one-to-one relation to the model. At the same time, we want to keep the mode-wise parameters $\mat{\beta}_j$ in \eqref{eq:gmlm-density} as those are the mode-wise reduction matrices needed by \cref{thm:sdr}. Using the mode-wise GMLM parameters has further advantages: the total number of parameters to be estimated is much smaller in many settings. Specifically, in the case of the multi-linear normal, a very efficient estimation algorithm is applicable. Moreover, the required number of observations for a reliable estimate is very small, potentially even smaller than any of the axis-dimensions $p_j$. We also gain significant estimation accuracy.
|
To resolve this issue, we need to disambiguate the GMLM parameters to reestablish a one-to-one relation to the model. At the same time, we want to keep the mode-wise parameters $\mat{\beta}_j$ in \eqref{eq:gmlm-density} as those are the mode-wise reduction matrices needed by \cref{thm:sdr}. Using the mode-wise GMLM parameters has further advantages: the total number of parameters to be estimated is much smaller in many settings. Specifically, in the case of the multi-linear normal, a very efficient estimation algorithm is applicable. Moreover, the required number of observations for a reliable estimate is very small, potentially even smaller than any of the axis-dimensions $p_j$. We also gain significant estimation accuracy.
|
||||||
|
|
||||||
In the derivation of the GMLM we first introduced the parameters $\overline{\eta}$ and $\mat{B}$, which models a linear relation between $\ten{X}$ and $Y$ through $\mat{\eta}_{1y}$ in \eqref{eq:eta1-manifold}, and the symmetric matrix $\mat{\Omega}$ to replace $\mat{\eta}_2$ in \eqref{eq:eta2-manifold}. Then, we modeled $\mat{B}$ using the mode-wise component matrices $\mat{\beta}_j$ which impose a non-linear constraint on $\mat{B} = \bigotimes_{j = r}^1\mat{\beta}_j$. Similarly, the introduction of the $\mat{\Omega}_j$'s in $\mat{\Omega} = \bigotimes_{j = r}^1 \mat{\Omega}_j$ constrains nonlinearly $\mat{\Omega}$. Both unconstrained $\mat{B}$ and $\vech(\mat{\Omega})$ are identifiable: The GMLM density \eqref{eq:gmlm-density} corresponds to one and only one $\mat{B}$ and $\vech(\mat{\Omega})$. Additionally, given any $\mat{\beta}_j$'s and $\mat{\Omega}_j$'s, then $\mat{B}$ and $\vech(\mat{\Omega})$ are uniquely determined. Based on these observations we derive the asymptotic behavior of the parameters $\mat{B}$ and $\vech{\mat{\Omega}}$ while operating with their components $\mat{\beta}_j$ and $\mat{\Omega}_j$. As a result we obtain a parameter space $\Theta$ with a non-linear constraint.
|
In the derivation of the GMLM we first introduced the parameters $\overline{\eta}$ and $\mat{B}$, which models a linear relation between $\ten{X}$ and $Y$ through $\mat{\eta}_{1y}$ in \eqref{eq:eta1-manifold}, and the symmetric matrix $\mat{\Omega}$ to replace $\mat{\eta}_2$ in \eqref{eq:eta2-manifold}. Then, we modeled $\mat{B}$ using the mode-wise component matrices $\mat{\beta}_j$ which impose a non-linear constraint on $\mat{B} = \bigotimes_{j = r}^1\mat{\beta}_j$. Similarly, the introduction of the $\mat{\Omega}_j$'s in $\mat{\Omega} = \bigotimes_{j = r}^1 \mat{\Omega}_j$ constrains nonlinearly $\mat{\Omega}$. Both unconstrained $\mat{B}$ and $\vech(\mat{\Omega})$ are identifiable: The GMLM density \eqref{eq:gmlm-density} corresponds to one and only one $\mat{B}$ and $\vech(\mat{\Omega})$. Additionally, given any $\mat{\beta}_j$'s and $\mat{\Omega}_j$'s, then $\mat{B}$ and $\vech(\mat{\Omega})$ are uniquely determined. Based on these observations we derive the asymptotic behavior of the parameters $\mat{B}$ and $\vech{\mat{\Omega}}$ while operating with their components $\mat{\beta}_j$ and $\mat{\Omega}_j$. As a result we obtain a parameter space $\Theta$ with a non-linear constraint.
|
||||||
|
|
||||||
Except for identifiable parameters, asymptotic normality (see \cref{thm:asymptotic-normality-gmlm} in \cref{sec:statprop}) requires differentiation. Therefore, the space itself must admit the definition of differentiation, which is usually a vector space. This is too strong an assumption for our purposes. To weaken the vector space assumption, we consider \emph{smooth manifolds}. These are spaces that look like Euclidean spaces locally and allow the notion of differentiation. The more general \emph{topological} manifolds are too weak for differentiation. A smooth manifold allows only for first derivatives. Without going into details, the solution is a \emph{Riemannian manifold} \cite[]{Lee2012,Lee2018,AbsilEtAl2007}. Similar to an abstract \emph{smooth manifold}, Riemannian manifolds are detached from our usual intuition and are complicated to handle. This is where an \emph{embedded (sub)manifold} comes to the rescue. Simply speaking, an embedded manifold is a manifold that is a subset of a manifold from which it inherits its properties. If a manifold is embedded in a Euclidean space, almost all the complications of abstract manifold theory simplify drastically. Moreover, since an Euclidean space is itself a Riemannian manifold, we inherit the means for higher derivatives. Finally, a smooth embedded submanifold structure for the parameter space maintains consistency with existing approaches and results for parameter sets with linear subspace structure. These reasons justify the constraint that the parameter space $\Theta$ be a \emph{smooth embedded manifold} in a Euclidean space.
|
Except for identifiable parameters, asymptotic normality (see \cref{thm:asymptotic-normality-gmlm} in \cref{sec:statprop}) requires differentiation. Therefore, the space itself must admit the definition of differentiation, which is usually a vector space. This is too strong an assumption for our purposes. To weaken the vector space assumption, we consider \emph{smooth manifolds}. These are spaces that resemble Euclidean spaces locally and allow the notion of differentiation. The more general \emph{topological} manifolds are too weak for differentiation. A smooth manifold allows only for first derivatives. Without going into details, the solution is a \emph{Riemannian manifold} \cite[]{Lee2012,Lee2018,AbsilEtAl2007}. Similar to an abstract \emph{smooth manifold}, Riemannian manifolds are detached from our usual intuition. They are also complicated to handle. This is where an \emph{embedded (sub)manifold} comes to the rescue. Simply speaking, an embedded manifold is a manifold that is a subset of a manifold from which it inherits its properties. If a manifold is embedded in a Euclidean space, almost all the complications of abstract manifold theory simplify drastically. Moreover, since an Euclidean space is itself a Riemannian manifold, we inherit the means for higher derivatives. Finally, a smooth embedded submanifold structure for the parameter space maintains consistency with existing approaches and results for parameter sets with linear subspace structure. These reasons justify the constraint that the parameter space $\Theta$ be a \emph{smooth embedded manifold} in a Euclidean space.
|
||||||
|
|
||||||
We now define a \emph{smooth manifold} embedded in $\mathbb{R}^p$, avoiding unnecessary detours into the more general theory (see \cite{Kaltenbaeck2021}).
|
We now define a \emph{smooth manifold} embedded in $\mathbb{R}^p$, avoiding unnecessary detours into the more general theory (see \cite{Kaltenbaeck2021}).
|
||||||
|
|
||||||
@ -429,7 +419,7 @@ We now define a \emph{smooth manifold} embedded in $\mathbb{R}^p$, avoiding unne
|
|||||||
|
|
||||||
As a basis to ensure that the constrained parameter space $\Theta$ is a manifold, without having to check every case separately, we provide two simple criteria that can be used to construct manifolds obeying the Kronecker product constraint. We need the concept of a \emph{spherical} set, which is a set $\manifold{A}$, on which the Frobenius norm, $\|\,.\,\|_F:\manifold{A}\to\mathbb{R}$, is constant. Also, we call a scale invariant set $\manifold{A}$ a \emph{cone}; that is, $\manifold{A} = \{ c \mat{A} : \mat{A}\in\manifold{A} \}$ for all $c > 0$.
|
As a basis to ensure that the constrained parameter space $\Theta$ is a manifold, without having to check every case separately, we provide two simple criteria that can be used to construct manifolds obeying the Kronecker product constraint. We need the concept of a \emph{spherical} set, which is a set $\manifold{A}$, on which the Frobenius norm, $\|\,.\,\|_F:\manifold{A}\to\mathbb{R}$, is constant. Also, we call a scale invariant set $\manifold{A}$ a \emph{cone}; that is, $\manifold{A} = \{ c \mat{A} : \mat{A}\in\manifold{A} \}$ for all $c > 0$.
|
||||||
|
|
||||||
\begin{theorem}[\hyperlink{proof:kron-manifolds}{Kronecker Product Manifolds}]\label{thm:kron-manifolds}
|
\begin{theorem}[Kronecker Product Manifolds]\label{thm:kron-manifolds}
|
||||||
Let $\manifold{A}\subseteq\mathbb{R}^{p_1\times q_1}\backslash\{\mat{0}\}, \manifold{B}\subseteq\mathbb{R}^{p_2\times q_2}\backslash\{\mat{0}\}$ be smooth embedded submanifolds. Assume one of the following conditions holds:
|
Let $\manifold{A}\subseteq\mathbb{R}^{p_1\times q_1}\backslash\{\mat{0}\}, \manifold{B}\subseteq\mathbb{R}^{p_2\times q_2}\backslash\{\mat{0}\}$ be smooth embedded submanifolds. Assume one of the following conditions holds:
|
||||||
\begin{itemize}
|
\begin{itemize}
|
||||||
\item[-] ``sphere condition'':
|
\item[-] ``sphere condition'':
|
||||||
@ -442,7 +432,7 @@ As a basis to ensure that the constrained parameter space $\Theta$ is a manifold
|
|||||||
|
|
||||||
With \cref{thm:kron-manifolds} we can obtain sufficient conditions for the construction of a constrained parameter manifold.
|
With \cref{thm:kron-manifolds} we can obtain sufficient conditions for the construction of a constrained parameter manifold.
|
||||||
|
|
||||||
\begin{theorem}[\hyperlink{proof:param-manifold}{Parameter Manifolds}]\label{thm:param-manifold}
|
\begin{theorem}[Parameter Manifolds]\label{thm:param-manifold}
|
||||||
Let
|
Let
|
||||||
\begin{displaymath}
|
\begin{displaymath}
|
||||||
\manifold{K}_{\mat{B}} = \Bigl\{ \bigkron_{k = r}^{1}\mat{\beta}_k : \mat{\beta}_k\in\manifold{B}_k \Bigr\}
|
\manifold{K}_{\mat{B}} = \Bigl\{ \bigkron_{k = r}^{1}\mat{\beta}_k : \mat{\beta}_k\in\manifold{B}_k \Bigr\}
|
||||||
@ -518,7 +508,7 @@ In a classical \emph{generalized linear model} (GLM), the link function connecti
|
|||||||
|
|
||||||
Gradient descent is a powerful and widely used optimization algorithm to compute MLEs. We compute the gradients of $l_n$ in \cref{thm:grad}.
|
Gradient descent is a powerful and widely used optimization algorithm to compute MLEs. We compute the gradients of $l_n$ in \cref{thm:grad}.
|
||||||
|
|
||||||
\begin{theorem}[\hyperlink{proof:grad}{Likelihood Gradient}]\label{thm:grad}
|
\begin{theorem}[Likelihood Gradient]\label{thm:grad}
|
||||||
Suppose $(\ten{X}_i, y_i), i = 1, ..., n$, are i.i.d. with conditional log-likelihood of the form \eqref{eq:log-likelihood}, where $\mat{\theta}$ denotes the collection of all GMLM parameters $\overline{\ten{\eta}}$, ${\mat{B}} = \bigkron_{k = r}^{1}{\mat{\beta}}_k$ and ${\mat{\Omega}} = \bigkron_{k = r}^{1}{\mat{\Omega}}_k$ for $k = 1, ..., r$. Then, the partial gradients with respect to $\overline{\ten{\eta}}, \mat{\beta}_1, \ldots, \mat{\beta}_r, \mat{\Omega}_1, \ldots, \mat{\Omega}_r$ are given by
|
Suppose $(\ten{X}_i, y_i), i = 1, ..., n$, are i.i.d. with conditional log-likelihood of the form \eqref{eq:log-likelihood}, where $\mat{\theta}$ denotes the collection of all GMLM parameters $\overline{\ten{\eta}}$, ${\mat{B}} = \bigkron_{k = r}^{1}{\mat{\beta}}_k$ and ${\mat{\Omega}} = \bigkron_{k = r}^{1}{\mat{\Omega}}_k$ for $k = 1, ..., r$. Then, the partial gradients with respect to $\overline{\ten{\eta}}, \mat{\beta}_1, \ldots, \mat{\beta}_r, \mat{\Omega}_1, \ldots, \mat{\Omega}_r$ are given by
|
||||||
\begin{align*}
|
\begin{align*}
|
||||||
\nabla_{\overline{\ten{\eta}}}l_n &\equiv \frac{1}{n}\sum_{i = 1}^n (\ten{X}_i - \E_{\mat{\theta}}[\ten{X} \mid Y = y_i]), \\
|
\nabla_{\overline{\ten{\eta}}}l_n &\equiv \frac{1}{n}\sum_{i = 1}^n (\ten{X}_i - \E_{\mat{\theta}}[\ten{X} \mid Y = y_i]), \\
|
||||||
@ -588,9 +578,9 @@ so that
|
|||||||
resulting in the estimates $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j)^{-1}$.
|
resulting in the estimates $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j)^{-1}$.
|
||||||
Estimation is performed by updating the estimates $\hat{\mat{\beta}}_j$ via \eqref{eq:tensor_normal_beta_solution} for $j = 1, \ldots, r$, and then recompute the $\hat{\mat{\Omega}}_j$ estimates simultaneously keeping the $\hat{\mat{\beta}}_j$s fixed. This procedure is repeated until convergence.
|
Estimation is performed by updating the estimates $\hat{\mat{\beta}}_j$ via \eqref{eq:tensor_normal_beta_solution} for $j = 1, \ldots, r$, and then recompute the $\hat{\mat{\Omega}}_j$ estimates simultaneously keeping the $\hat{\mat{\beta}}_j$s fixed. This procedure is repeated until convergence.
|
||||||
|
|
||||||
A technical detail for numerical stability is to ensure that the scaled values $\tilde{s}\tilde{\mat{\Sigma}}_j$, assumed to be symmetric and positive definite, are well conditioned. Thus, we estimate the condition number of $\tilde{s}\tilde{\mat{\Sigma}}_j$ before computing the inverse. In case of ill-conditioning, we use the regularized $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j + 0.2 \lambda_{1}(\tilde{s}\tilde{\mat{\Sigma}}_j)\mat{I}_{p_j})^{-1}$ instead, where $\lambda_{1}(\tilde{s}\tilde{\mat{\Sigma}}_j)$ is the first (maximum) eigenvalue. Experiments showed that this regularization is usually only required in the first few iterations.
|
A technical detail for numerical stability is to ensure that the scaled values $\tilde{s}\tilde{\mat{\Sigma}}_j$, assumed to be symmetric and positive definite, are well conditioned. Thus, we estimate the condition number of $\tilde{s}\tilde{\mat{\Sigma}}_j$ before computing the inverse. In case of ill-conditioning, we use the \emph{regularized} $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j + 0.2 \lambda_{1}(\tilde{s}\tilde{\mat{\Sigma}}_j)\mat{I}_{p_j})^{-1}$ instead, where $\lambda_{1}(\tilde{s}\tilde{\mat{\Sigma}}_j)$ is the first (maximum) eigenvalue. Experiments showed that this regularization is usually only required in the first few iterations.
|
||||||
|
|
||||||
If the parameter space follows a more general setting as in \cref{thm:param-manifold}, updating may produce estimates outside the parameter space. A simple and efficient method is to project every updated estimate onto the corresponding manifold.
|
If the parameter space follows a more general setting, as in \cref{thm:param-manifold}, updating may produce estimates outside the parameter space. A simple and efficient method is to project every updated estimate onto the corresponding manifold.
|
||||||
|
|
||||||
A standard algorithm to calculate the MLE of a Kronecker product is block-coordinate descent, proposed independently by \cite{MardiaGoodall1993} and \cite{Dutilleul1999}. It was later called ``flip-flop'' algorithm by \cite{LuZimmerman2005} for the computation of the maximum likelihood estimators of the components of a separable covariance matrix. \cite{ManceurDutilleul2013} extended the ``flip-flop'' algorithm for the computation of the MLE of the separable covariance structure of a 3-way and 4-way normal distribution and obtained a lower bound for the sample size required for its existence. The same issue was also studied by \cite{DrtonEtAl2020} in the case of a two-way array (matrix). Our algorithm uses a similar ``flip-flop'' approach by iteratively updating the $\mat{\beta}_k$'s and $\mat{\Omega}_k$'s.
|
A standard algorithm to calculate the MLE of a Kronecker product is block-coordinate descent, proposed independently by \cite{MardiaGoodall1993} and \cite{Dutilleul1999}. It was later called ``flip-flop'' algorithm by \cite{LuZimmerman2005} for the computation of the maximum likelihood estimators of the components of a separable covariance matrix. \cite{ManceurDutilleul2013} extended the ``flip-flop'' algorithm for the computation of the MLE of the separable covariance structure of a 3-way and 4-way normal distribution and obtained a lower bound for the sample size required for its existence. The same issue was also studied by \cite{DrtonEtAl2020} in the case of a two-way array (matrix). Our algorithm uses a similar ``flip-flop'' approach by iteratively updating the $\mat{\beta}_k$'s and $\mat{\Omega}_k$'s.
|
||||||
|
|
||||||
@ -654,7 +644,7 @@ If the objective function $M_n$ is the log-likelihood, $\widehat{\mat{\theta}}
|
|||||||
is called a \emph{strong M-estimator} over $\Theta$. Replacing $o_P(n^{-1})$ by $o_P(1)$ gives a \emph{weak M-estimator}.
|
is called a \emph{strong M-estimator} over $\Theta$. Replacing $o_P(n^{-1})$ by $o_P(1)$ gives a \emph{weak M-estimator}.
|
||||||
\end{definition}
|
\end{definition}
|
||||||
|
|
||||||
\begin{theorem}[\hyperlink{proof:asymptotic-normality-gmlm}{Asymptotic Normality}]\label{thm:asymptotic-normality-gmlm}
|
\begin{theorem}[Asymptotic Normality]\label{thm:asymptotic-normality-gmlm}
|
||||||
Assume $Z = (\ten{X}, Y)$ satisfies model \eqref{eq:quad-density} subject to \eqref{eq:eta1-manifold} and \eqref{eq:eta2-manifold} with true constrained parameter $\mat{\theta}_0 = (\overline{\eta}_0, \mat{B}_0, \vech{\mat{\Omega}_0})\in\Theta$, where $\Theta$ is defined in \cref{thm:param-manifold}. Under the regularity Conditions 1--3 in Appendix~B in the supplementary material, there exists a strong M-estimator sequence $\hat{\mat{\theta}}_n$ deriving from $l_n$ in \eqref{eq:log-likelihood} over $\Theta$. Furthermore, any strong M-estimator $\hat{\mat{\theta}}_n$ converges in probability to the true parameter $\mat{\theta}_0$, $\hat{\mat{\theta}}_n\xrightarrow{p}\mat{\theta}_0$, over $\Theta$. Moreover, every strong M-estimator $\hat{\mat{\theta}}_n$ is asymptotically normal,
|
Assume $Z = (\ten{X}, Y)$ satisfies model \eqref{eq:quad-density} subject to \eqref{eq:eta1-manifold} and \eqref{eq:eta2-manifold} with true constrained parameter $\mat{\theta}_0 = (\overline{\eta}_0, \mat{B}_0, \vech{\mat{\Omega}_0})\in\Theta$, where $\Theta$ is defined in \cref{thm:param-manifold}. Under the regularity Conditions 1--3 in Appendix~B in the supplementary material, there exists a strong M-estimator sequence $\hat{\mat{\theta}}_n$ deriving from $l_n$ in \eqref{eq:log-likelihood} over $\Theta$. Furthermore, any strong M-estimator $\hat{\mat{\theta}}_n$ converges in probability to the true parameter $\mat{\theta}_0$, $\hat{\mat{\theta}}_n\xrightarrow{p}\mat{\theta}_0$, over $\Theta$. Moreover, every strong M-estimator $\hat{\mat{\theta}}_n$ is asymptotically normal,
|
||||||
\begin{displaymath}
|
\begin{displaymath}
|
||||||
\sqrt{n}(\hat{\mat{\theta}}_n - \mat{\theta}_0) \xrightarrow{d} \mathcal{N}(0, \mat{\Sigma}_{\mat{\theta}_0})
|
\sqrt{n}(\hat{\mat{\theta}}_n - \mat{\theta}_0) \xrightarrow{d} \mathcal{N}(0, \mat{\Sigma}_{\mat{\theta}_0})
|
||||||
@ -665,7 +655,7 @@ If the objective function $M_n$ is the log-likelihood, $\widehat{\mat{\theta}}
|
|||||||
To provide an intuition for the asymptotic variance-covariance structure $\mat{\Sigma}_{\mat{\theta}_0}$, we start from the classical, non-degenerate setting of an MLE $\hat{\mat{\xi}}_n$ in an unconstrained parameter space $\Xi$ containing the true parameter $\mat{\xi}_0$. In this case, $\mat{\Sigma}_{\mat{\xi}_0}$ is symmetric positive definite. Such matrices can be associated with a hyper-ellipsoid with axes associated with the eigenvectors of $\mat{\Sigma}_{\mat{\xi}_0}$. Given the manifold parameter space $\Theta\subseteq\Xi$ with true parameter $\mat{\theta}_0 = \mat{\xi}_0$, the asymptotic variance-covariance $\mat{\Sigma}_{\mat{\theta}_0}$ is a positive semi-definite matrix associated with a (degenerate) hyper-ellipsoid resulting from intersecting the hyper-ellipsoid of $\mat{\Sigma}_{\mat{\xi}_0}$ with the tangent space of $\Theta$ at $\mat{\theta}_0 = \mat{\xi}_0$ and distorting its shape with respect to the local curvature of $\Theta$ at $\mat{\theta}_0$.
|
To provide an intuition for the asymptotic variance-covariance structure $\mat{\Sigma}_{\mat{\theta}_0}$, we start from the classical, non-degenerate setting of an MLE $\hat{\mat{\xi}}_n$ in an unconstrained parameter space $\Xi$ containing the true parameter $\mat{\xi}_0$. In this case, $\mat{\Sigma}_{\mat{\xi}_0}$ is symmetric positive definite. Such matrices can be associated with a hyper-ellipsoid with axes associated with the eigenvectors of $\mat{\Sigma}_{\mat{\xi}_0}$. Given the manifold parameter space $\Theta\subseteq\Xi$ with true parameter $\mat{\theta}_0 = \mat{\xi}_0$, the asymptotic variance-covariance $\mat{\Sigma}_{\mat{\theta}_0}$ is a positive semi-definite matrix associated with a (degenerate) hyper-ellipsoid resulting from intersecting the hyper-ellipsoid of $\mat{\Sigma}_{\mat{\xi}_0}$ with the tangent space of $\Theta$ at $\mat{\theta}_0 = \mat{\xi}_0$ and distorting its shape with respect to the local curvature of $\Theta$ at $\mat{\theta}_0$.
|
||||||
|
|
||||||
\begin{remark}
|
\begin{remark}
|
||||||
\cref{thm:asymptotic-normality-gmlm} is a special case of a more general asymptotic normality Theorem~6 that also generalizes Theorem~5.23 in \cite{vanderVaart1998}, where $\Theta$ is an open subset of an Euclidean space, which is the simplest form of an embedded manifold. Theorem~6 is provided in Appendix~B in the supplementary material due to its technical nature.
|
\cref{thm:asymptotic-normality-gmlm} is a special case of a more general asymptotic normality Theorem~B.4 that also generalizes Theorem~5.23 in \cite{vanderVaart1998}, where $\Theta$ is an open subset of an Euclidean space, which is the simplest form of an embedded manifold. Theorem~B.4 is rather technical. It is stated and shown in Appendix~B in the supplementary material.
|
||||||
\end{remark}
|
\end{remark}
|
||||||
|
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
@ -673,11 +663,11 @@ To provide an intuition for the asymptotic variance-covariance structure $\mat{\
|
|||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
In this section, we report simulation results for the multi-linear normal and the multi-linear Ising model where different aspects of the GMLM model are compared against other methods. These are: \textit{Tensor Sliced Inverse Regression} (TSIR) \cite[]{DingCook2015}, an extension of Sliced Inverse Regression (SIR) \cite{Li1991} to tensor-valued predictors; the \textit{Multiway Generalized Canonical Correlation Analysis} (MGCCA) \cite[]{ChenEtAl2021,GirkaEtAl2024}, an extension of canonical correlation analysis (CCA) designed to handle multi-block data with tensor structure; and the Tucker decomposition that is a higher-order form of principal component analysis (HOPCA) \cite[]{KoldaBader2009}, for both continuous and binary data. For the latter, the binary values are treated as continuous. As part of our baseline analysis, we also incorporate traditional Principal Component Analysis (PCA) on vectorized observations. In the case of the Ising model, we also compare with LPCA (Logistic PCA) and CLPCA (Convex Logistic PCA), both introduced in \cite{LandgrafLee2020}. All experiments are performed with sample sizes $n = 100, 200, 300, 500$ and $750$. Each experiment is repeated $100$ times.
|
In this section, we report simulation results for the multi-linear normal and the multi-linear Ising model where different aspects of the GMLM model are compared against other methods. These are: \textit{Tensor Sliced Inverse Regression} (TSIR) \cite[]{DingCook2015}, an extension of Sliced Inverse Regression (SIR) \cite{Li1991} to tensor-valued predictors; the \textit{Multiway Generalized Canonical Correlation Analysis} (MGCCA) \cite[]{ChenEtAl2021,GirkaEtAl2024}, an extension of canonical correlation analysis (CCA) designed to handle multi-block data with tensor structure; and the Tucker decomposition that is a higher-order form of principal component analysis (HOPCA) \cite[]{KoldaBader2009}, for both continuous and binary data. For the latter, the binary values are treated as continuous. As part of our baseline analysis, we also incorporate traditional Principal Component Analysis (PCA) on vectorized observations. In the case of the Ising model, we also compare with LPCA (Logistic PCA) and CLPCA (Convex Logistic PCA), both introduced in \cite{LandgrafLee2020}. All experiments are performed with sample sizes $n = 100, 200, 300, 500$ and $750$. Each experiment is repeated $100$ times.
|
||||||
|
|
||||||
To assess the accuracy of the estimation of $\ten{R}(\ten{X})$ in \cref{thm:sdr}, we compare the estimate with the true vectorized reduction matrix $\mat{B} = \bigkron_{k = r}^{1}\mat{\beta}_k$, as it is compatible with any linear reduction method. We compute the \emph{subspace distance}, $d(\mat{B}, \hat{\mat{B}})$, between $\mat{B}\in\mathbb{R}^{p\times q}$ and an estimate $\hat{\mat{B}}\in\mathbb{R}^{p\times \tilde{q}}$, which satisfies
|
To assess the accuracy of the estimation of $\ten{R}(\ten{X})$ in \cref{thm:sdr}, we compare the estimate with the true vectorized reduction matrix $\mat{B} = \bigkron_{k = r}^{1}\mat{\beta}_k$, as it is compatible with any linear reduction method. We compute the \emph{subspace distance}, $d(\mat{B}, \hat{\mat{B}})$, between $\mat{B}\in\mathbb{R}^{p\times q}$ and an estimate $\hat{\mat{B}}\in\mathbb{R}^{p\times \tilde{q}}$, which satisfies
|
||||||
\begin{displaymath}
|
\begin{displaymath}
|
||||||
d(\mat{B}, \hat{\mat{B}}) \propto \| \mat{B}\pinv{(\t{\mat{B}}\mat{B})}\t{\mat{B}} - \hat{\mat{B}}\pinv{(\t{\hat{\mat{B}}}\hat{\mat{B}})}\t{\hat{\mat{B}}} \|_F,
|
d(\mat{B}, \hat{\mat{B}}) \propto \| \mat{B}\pinv{(\t{\mat{B}}\mat{B})}\t{\mat{B}} - \hat{\mat{B}}\pinv{(\t{\hat{\mat{B}}}\hat{\mat{B}})}\t{\hat{\mat{B}}} \|_F,
|
||||||
\end{displaymath}
|
\end{displaymath}
|
||||||
where $\propto$ signifies proportional to. The proportionality constant\footnote{The proportionality constant depends on the dimension $p$ and the ranks of $\mat{B}$ and $\hat{\mat{B}}$. The explicit value of the proportionality constant is given by $(\min(\rank\mat{B} + \rank\hat{\mat{B}}, 2 p - (\rank\mat{B} + \rank\hat{\mat{B}})))^{-1/2}$.} ensures $d(\mat{B}, \hat{\mat{B}}) \in [0, 1]$. A distance of zero implies space overlap and a distance of one implies orthogonality of the subspaces.
|
where $\propto$ signifies proportional to. The proportionality constant\footnote{The proportionality constant depends on the dimension $p$ and the ranks of $\mat{B}$ and $\hat{\mat{B}}$. The explicit value of the proportionality constant is given by $(\min(\rank\mat{B} + \rank\hat{\mat{B}}, 2 p - (\rank\mat{B} + \rank\hat{\mat{B}})))^{-1/2}$.} ensures $d(\mat{B}, \hat{\mat{B}}) \in [0, 1]$. A distance of zero implies space overlap and a distance of one implies orthogonality of the subspaces.
|
||||||
|
|
||||||
|
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
@ -830,7 +820,7 @@ As a real data application on regressions with binary tensor-valued predictors,
|
|||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
\section{Discussion}\label{sec:discussion}
|
\section{Discussion}\label{sec:discussion}
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
In this paper, we propose a generalized multi-linear model formulation for the inverse conditional distribution of a tensor-valued predictor given a response and derive a multi-linear sufficient reduction for the corresponding forward regression/classification problem. We also propose estimators for the sufficient reduction and show they are consistent and asymptotically normal. We demonstrate, through a numerical example in supplementary material Appendix~C, the modeling benefits of leveraging the tensor structure of the data.
|
In this paper, we propose a generalized multi-linear model formulation for the inverse conditional distribution of a tensor-valued predictor given a response and derive a multi-linear sufficient reduction for the corresponding forward regression/classification problem. We also propose estimators for the sufficient reduction and show they are consistent and asymptotically normal. We demonstrate the modeling benefits of leveraging the tensor structure of the data with a numerical example in Appendix~C of the supplementary material.
|
||||||
|
|
||||||
Obtaining the asymptotic results required leveraging manifolds as a basis for resolving the issue of unidentifiable parameters. This in turn led to an even more flexible modeling framework, which allows building complex and potentially problem-specific parameter spaces that incorporate additional domain-specific knowledge into the model.
|
Obtaining the asymptotic results required leveraging manifolds as a basis for resolving the issue of unidentifiable parameters. This in turn led to an even more flexible modeling framework, which allows building complex and potentially problem-specific parameter spaces that incorporate additional domain-specific knowledge into the model.
|
||||||
We allude to this feature of our approach in \cref{sec:matrix-manifolds}, where we also tabulate different matrix manifolds that can be used as building blocks $\manifold{B}_k$ and $\manifold{O}_k$ of the parameter space in \cref{tab:matrix-manifolds}. For example, our formulation can easily accommodate longitudinal data tabulated in matrix format, where the rows are covariates and the columns are consecutive time points with discrete AR($k$) dependence structure.
|
We allude to this feature of our approach in \cref{sec:matrix-manifolds}, where we also tabulate different matrix manifolds that can be used as building blocks $\manifold{B}_k$ and $\manifold{O}_k$ of the parameter space in \cref{tab:matrix-manifolds}. For example, our formulation can easily accommodate longitudinal data tabulated in matrix format, where the rows are covariates and the columns are consecutive time points with discrete AR($k$) dependence structure.
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,25 +1,30 @@
|
|||||||
\documentclass[aos]{imsart}
|
\documentclass[aos,supplement]{imsart}
|
||||||
|
|
||||||
%% Packages
|
%% Packages
|
||||||
\usepackage[utf8]{inputenc}
|
\usepackage[utf8]{inputenc}
|
||||||
\usepackage[LSF, T1]{fontenc}
|
\usepackage[LSF, T1]{fontenc}
|
||||||
% \usepackage{lmodern} % TODO: interferes with "imsart" classed author address display
|
\RequirePackage{amsthm,amsmath,amsfonts,amssymb}
|
||||||
\usepackage{amsthm, amsmath, amsfonts, amssymb, bm, pifont}
|
\usepackage{bm,pifont}
|
||||||
\usepackage{float}
|
\usepackage{float}
|
||||||
\usepackage{chessfss}
|
\usepackage{chessfss}
|
||||||
\usepackage{scalerel}
|
\usepackage{scalerel}
|
||||||
|
\RequirePackage[authoryear]{natbib} % author-year citations
|
||||||
|
\usepackage{hyperref}
|
||||||
|
\usepackage{xr-hyper}
|
||||||
|
% \RequirePackage[
|
||||||
|
% colorlinks
|
||||||
|
% ]{hyperref} % for coloring bibliography citations and linked URLs
|
||||||
|
\hypersetup{
|
||||||
|
colorlinks,
|
||||||
|
citecolor={blue},
|
||||||
|
urlcolor={blue},
|
||||||
|
linkcolor={blue},
|
||||||
|
filecolor={red!60!black}
|
||||||
|
}
|
||||||
|
\RequirePackage{graphicx} % for including figures
|
||||||
\usepackage[dvipsnames]{xcolor}
|
\usepackage[dvipsnames]{xcolor}
|
||||||
\usepackage{graphicx}
|
|
||||||
\usepackage[authoryear]{natbib}
|
|
||||||
|
|
||||||
% External References to main file
|
|
||||||
\usepackage{xr}
|
|
||||||
\externaldocument{main}
|
|
||||||
|
|
||||||
\usepackage[colorlinks, citecolor = blue, urlcolor = blue]{hyperref}
|
|
||||||
\usepackage[noabbrev, capitalize, nameinlink]{cleveref} % after hyperref
|
\usepackage[noabbrev, capitalize, nameinlink]{cleveref} % after hyperref
|
||||||
|
|
||||||
|
|
||||||
\startlocaldefs
|
\startlocaldefs
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
%% %%
|
%% %%
|
||||||
@ -35,10 +40,10 @@
|
|||||||
%% %%
|
%% %%
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
\theoremstyle{plain}
|
\theoremstyle{plain}
|
||||||
\newtheorem{theorem}{Theorem}
|
% \newtheorem{axiom}{Axiom}
|
||||||
\newtheorem{lemma}{Lemma}
|
% \newtheorem{claim}[axiom]{Claim}
|
||||||
\newtheorem{corollary}{Corollary}
|
\newtheorem{theorem}{Theorem}[section]
|
||||||
\newtheorem{proposition}{Proposition}
|
\newtheorem{lemma}[theorem]{Lemma}
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
%% %%
|
%% %%
|
||||||
%% For Assumption, Definition, Example, %%
|
%% For Assumption, Definition, Example, %%
|
||||||
@ -46,23 +51,25 @@
|
|||||||
%% use \theoremstyle{remark} %%
|
%% use \theoremstyle{remark} %%
|
||||||
%% %%
|
%% %%
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
\theoremstyle{remark}
|
\theoremstyle{definition}
|
||||||
\newtheorem{definition}{Definition}
|
\newtheorem{definition}[theorem]{Definition}
|
||||||
\newtheorem{condition}{Condition}
|
\newtheorem{condition}{Condition}
|
||||||
\newtheorem{example}{Example}
|
\newtheorem*{example}{Example}
|
||||||
\newtheorem{remark}{Remark}
|
% \newtheorem*{remark}{Remark}
|
||||||
|
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
%% Please put your definitions here: %%
|
%% Our custom definitions: %%
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
|
|
||||||
%% Clever ref additional reference name
|
%% Clever ref additional reference name
|
||||||
\crefname{condition}{Condition}{Conditions}
|
\crefname{condition}{Condition}{Conditions}
|
||||||
\Crefname{condition}{Condition}{Conditions}
|
\Crefname{condition}{Condition}{Conditions}
|
||||||
\crefrangelabelformat{condition}{#3#1#4-#5#2#6}
|
\crefrangelabelformat{condition}{#3#1#4-#5#2#6}
|
||||||
|
\crefname{equation}{}{}
|
||||||
|
\Crefname{equation}{}{}
|
||||||
|
|
||||||
% matrices
|
% typesetting of matrices, tensors and manifolds
|
||||||
\newcommand*{\mat}[1]{\boldsymbol{#1}}
|
\newcommand*{\mat}[1]{\boldsymbol{#1}}
|
||||||
% tensors (special case for lower case caligraphic letters)
|
|
||||||
\newcommand*{\ten}[1]{
|
\newcommand*{\ten}[1]{
|
||||||
\ifnum\pdfstrcmp{#1}{`}=1 % lowercase argument
|
\ifnum\pdfstrcmp{#1}{`}=1 % lowercase argument
|
||||||
\mathfrak{#1}
|
\mathfrak{#1}
|
||||||
@ -76,14 +83,11 @@
|
|||||||
\renewcommand{\hat}{\widehat}
|
\renewcommand{\hat}{\widehat}
|
||||||
\renewcommand*{\vec}{\operatorname{vec}}
|
\renewcommand*{\vec}{\operatorname{vec}}
|
||||||
\newcommand*{\vech}{\operatorname{vech}}
|
\newcommand*{\vech}{\operatorname{vech}}
|
||||||
\newcommand*{\rank}{\operatorname{rank}}
|
|
||||||
\newcommand*{\diag}{\operatorname{diag}}
|
\newcommand*{\diag}{\operatorname{diag}}
|
||||||
\DeclareMathOperator{\tr}{tr}
|
\DeclareMathOperator{\tr}{tr}
|
||||||
\DeclareMathOperator{\var}{Var}
|
|
||||||
\DeclareMathOperator{\cov}{Cov}
|
\DeclareMathOperator{\cov}{Cov}
|
||||||
\DeclareMathOperator{\Span}{span}
|
\DeclareMathOperator{\Span}{span}
|
||||||
\DeclareMathOperator{\E}{\operatorname{\mathbb{E}}}
|
\DeclareMathOperator{\E}{\operatorname{\mathbb{E}}}
|
||||||
\DeclareMathOperator*{\argmax}{{arg\,max}}
|
|
||||||
\newcommand*{\D}{\textnormal{D}} % derivative
|
\newcommand*{\D}{\textnormal{D}} % derivative
|
||||||
\renewcommand*{\H}{\textnormal{H}} % hessian
|
\renewcommand*{\H}{\textnormal{H}} % hessian
|
||||||
\renewcommand*{\d}{\textnormal{d}} % differential
|
\renewcommand*{\d}{\textnormal{d}} % differential
|
||||||
@ -94,22 +98,6 @@
|
|||||||
% ternary operator (C style arguments: <condition> ? <val_if_true> : <val_if_false>)
|
% ternary operator (C style arguments: <condition> ? <val_if_true> : <val_if_false>)
|
||||||
\newcommand{\ternary}[3]{{#2}{\ \mathrm{if}\ }{#1}{\ \mathrm{else}\ }{#3}}
|
\newcommand{\ternary}[3]{{#2}{\ \mathrm{if}\ }{#1}{\ \mathrm{else}\ }{#3}}
|
||||||
|
|
||||||
\renewcommand{\checkmark}{{\color{Green}\ding{51}}}
|
|
||||||
\newcommand{\xmark}{{\color{Red!70}\ding{55}}}
|
|
||||||
|
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
||||||
\newcommand{\efi}[1]{{\color{teal}Effie: #1}}
|
|
||||||
\newcommand{\daniel}[1]{{\color{red!70!black}Daniel: #1}}
|
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
||||||
|
|
||||||
% Special Matrix Sets (Manifolds)
|
|
||||||
\newcommand{\StiefelNonCompact}[2]{\mathbb{R}_{*}^{{#1}\times {#2}}}
|
|
||||||
\newcommand{\Stiefel}[2]{\mathrm{St}^{{#1}\times {#2}}}
|
|
||||||
\newcommand{\SymMat}[1]{\mathrm{Sym}^{{#1}\times {#1}}}
|
|
||||||
\newcommand{\SymPosDefMat}[1]{\mathrm{Sym}_{++}^{{#1}\times {#1}}}
|
|
||||||
\newcommand{\OrthogonalGrp}[1]{\mathrm{O}(#1)}
|
|
||||||
\newcommand{\SpecialOrthogonalGrp}[1]{\mathrm{SO}(#1)}
|
|
||||||
|
|
||||||
%%% Custom operators with either one or two arguments (limits)
|
%%% Custom operators with either one or two arguments (limits)
|
||||||
\makeatletter
|
\makeatletter
|
||||||
%%% Multi-Linear Multiplication
|
%%% Multi-Linear Multiplication
|
||||||
@ -182,6 +170,8 @@
|
|||||||
|
|
||||||
\endlocaldefs
|
\endlocaldefs
|
||||||
|
|
||||||
|
% External References to main file
|
||||||
|
\externaldocument{main}
|
||||||
|
|
||||||
\begin{document}
|
\begin{document}
|
||||||
|
|
||||||
@ -219,7 +209,7 @@
|
|||||||
|
|
||||||
%%% Set counters to continue from main paper counters (after frontmatter!)
|
%%% Set counters to continue from main paper counters (after frontmatter!)
|
||||||
\setcounter{theorem}{5}
|
\setcounter{theorem}{5}
|
||||||
\setcounter{definition}{2}
|
% \setcounter{definition}{2}
|
||||||
\setcounter{figure}{3}
|
\setcounter{figure}{3}
|
||||||
\setcounter{equation}{19}
|
\setcounter{equation}{19}
|
||||||
\setcounter{footnote}{10}
|
\setcounter{footnote}{10}
|
||||||
@ -305,100 +295,12 @@
|
|||||||
\end{example}
|
\end{example}
|
||||||
|
|
||||||
|
|
||||||
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
||||||
% \section{Multi Linear Algebra}\label{app:multi-linear-algebra}
|
|
||||||
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
||||||
%
|
|
||||||
% \begin{displaymath}
|
|
||||||
% (\ten{A}\circ\ten{B})\mlm_{k = 1}^{r + s} \mat{C}_k
|
|
||||||
% =
|
|
||||||
% \Bigl(\ten{A}\mlm_{k = 1}^r \mat{C}_k\Bigr)\circ\Bigl(\ten{B}\mlm_{l = 1}^s \mat{C}_{l + r}\Bigr)
|
|
||||||
% \end{displaymath}
|
|
||||||
% Using $\K(\ten{A}\circ\ten{B}) = \ten{A}\otimes\ten{B}$ gives
|
|
||||||
% \begin{displaymath}
|
|
||||||
% \K\Bigl((\ten{A}\circ\ten{B})\mlm_{k = 1}^{r + s} \mat{C}_k\Bigr)
|
|
||||||
% =
|
|
||||||
% \Bigl(\ten{A}\mlm_{k = 1}^r \mat{C}_k\Bigr)\otimes\Bigl(\ten{B}\mlm_{l = 1}^s \mat{C}_{l + r}\Bigr)
|
|
||||||
% \end{displaymath}
|
|
||||||
% A generalization of the well-known identity $\vec(\mat{A}\mat{B}\mat{C}) = (\t{\mat{C}}\otimes\mat{A})\vec{\mat{B}}$ is given by
|
|
||||||
% \begin{displaymath}
|
|
||||||
% \Bigl(\ten{A}\mlm_{k = 1}^r \mat{B}_k \Bigr)_{(\mat{i}, \mat{j})}
|
|
||||||
% =
|
|
||||||
% \Bigl( \bigotimes_{k = \#\mat{i}}^{1}\mat{B}_{\mat{i}_k} \Bigr)
|
|
||||||
% \ten{A}_{(\mat{i}, \mat{j})}
|
|
||||||
% \Bigl( \bigotimes_{l = \#\mat{j}}^{1}\t{\mat{B}_{\mat{j}_l}} \Bigr)
|
|
||||||
% \end{displaymath}
|
|
||||||
% with the special case
|
|
||||||
% \begin{displaymath}
|
|
||||||
% \vec\Bigl(\ten{A}\mlm_{k = 1}^r \mat{B}_k\Bigr)
|
|
||||||
% =
|
|
||||||
% \Bigl(\bigotimes_{k = r}^{1}\mat{B}_k\Bigr)\vec{\ten{A}}
|
|
||||||
% \end{displaymath}
|
|
||||||
|
|
||||||
% Furthermore, we have
|
|
||||||
% \begin{displaymath}
|
|
||||||
% (\ten{A}\otimes\ten{B})\mlm_{k = 1}^{r}\t{(\vec\mat{C}_k)}
|
|
||||||
% =
|
|
||||||
% \Bigl\langle \ten{A}\mlm_{k = 1}^{r} \mat{C}_k, \ten{B} \Bigr\rangle
|
|
||||||
% =
|
|
||||||
% \Bigl\langle \ten{A}, \ten{B}\mlm_{k = 1}^{r} \t{\mat{C}_k} \Bigr\rangle
|
|
||||||
% =
|
|
||||||
% \t{(\vec{\ten{B}})}\Bigl(\bigotimes_{k = r}^{1}\mat{C}_k\Bigr)\vec{\ten{A}}
|
|
||||||
% \end{displaymath}
|
|
||||||
% as well as for any tensor $\ten{A}$ of even order $2 r$ and matching square matrices $\mat{B}_k$ holds
|
|
||||||
% \begin{displaymath}
|
|
||||||
% \K(\ten{A})\mlm_{k = 1}^{r}\t{(\vec\mat{B}_k)}
|
|
||||||
% =
|
|
||||||
% \t{(\vec{\ten{A}})}\vec\Bigl(\bigotimes_{k = r}^{1}\t{\mat{B}_k}\Bigr)
|
|
||||||
% \end{displaymath}
|
|
||||||
|
|
||||||
% \begin{lemma}\label{thm:kron-perm}
|
|
||||||
% Given $r \geq 2$ matrices $\mat{A}_k$ of dimension $p_j\times q_j$ for $k = 1, \ldots, r$, then there exists a unique permutation matrix $\mat{S}_{\mat{p}, \mat{q}}$ such that
|
|
||||||
% \begin{equation}\label{eq:kron-to-outer-perm}
|
|
||||||
% \vec\bigkron_{k = r}^{1}\mat{A}_k = \mat{S}_{\mat{p}, \mat{q}}\vec\bigouter_{k = 1}^{r}\mat{A}_k.
|
|
||||||
% \end{equation}
|
|
||||||
% The permutation $\mat{S}_{\mat{p}, \mat{q}}$ with indices $\mat{p} = (p_1, \ldots, p_r)$ and $\mat{q} = (q_1, \ldots, q_r)$ is defined recursively as
|
|
||||||
% \begin{equation}\label{eq:S_pq}
|
|
||||||
% \mat{S}_{\mat{p}, \mat{q}} = \mat{S}_{\bigl( \prod_{k = 1}^{r - 1}p_k, p_r \bigr), \bigl( \prod_{k = 1}^{r - 1}q_k, q_r \bigr)} \bigl(\mat{I}_{p_r q_r}\otimes\mat{S}_{(p_1, \ldots, p_{r-1}), (q_1, \ldots, q_{r-1})}\bigr)
|
|
||||||
% \end{equation}
|
|
||||||
% with initial value
|
|
||||||
% \begin{displaymath}
|
|
||||||
% \mat{S}_{(p_1, p_2), (q_1, q_2)} = \mat{I}_{q_2}\otimes\mat{K}_{q_1, p_2}\otimes\mat{I}_{p_1}
|
|
||||||
% \end{displaymath}
|
|
||||||
% where $\mat{K}_{p, q}$ is the \emph{commutation matrix} \cite[][Ch.~11]{AbadirMagnus2005}, that is the permutation such that $\vec{\t{\mat{A}}} = \mat{K}_{p, q}\vec{\mat{A}}$ for every $p\times q$ dimensional matrix $\mat{A}$.
|
|
||||||
% \end{lemma}
|
|
||||||
% \begin{proof}
|
|
||||||
% Lemma~7 in \cite{MagnusNeudecker1986} states that
|
|
||||||
% \begin{align}
|
|
||||||
% \vec(\mat{A}_2\otimes\mat{A}_1)
|
|
||||||
% &= (\mat{I}_{q_2}\otimes\mat{K}_{q_1, p_2}\otimes\mat{I}_{p_1})(\vec{\mat{A}_2}\otimes\vec{\mat{A}_1}) \label{eq:MagnusNeudecker1986-vec-kron-identity} \\
|
|
||||||
% &= (\mat{I}_{q_2}\otimes\mat{K}_{q_1, p_2}\otimes\mat{I}_{p_1})\vec(\mat{A}_1\circ \mat{A}_2). \nonumber
|
|
||||||
% \end{align}
|
|
||||||
% This proves the statement for $r = 2$. The general statement for $r > 2$ follows via induction. Assuming \eqref{eq:kron-to-outer-perm} holds for $r - 1$, the induction step is then:
|
|
||||||
% \begin{multline*}
|
|
||||||
% \vec{\bigkron_{k = r}^{1}}\mat{A}_k
|
|
||||||
% = \vec\Bigl(\mat{A}_r\otimes\bigkron_{k = r - 1}^{1}\mat{A}_k\Bigr) \\
|
|
||||||
% \overset{\eqref{eq:MagnusNeudecker1986-vec-kron-identity}}{=} \Bigl( \mat{I}_{q_r}\otimes\mat{K}_{\prod_{k = 1}^{r - 1}q_k, p_r}\otimes\mat{I}_{\prod_{k = 1}^{r - 1}p_k} \Bigr)\vec\Bigl((\vec\mat{A}_r)\otimes\vec\bigkron_{k = r - 1}^{1}\mat{A}_k\Bigr) \\
|
|
||||||
% = \mat{S}_{\bigl( \prod_{k = 1}^{r - 1}p_k, p_r \bigr), \bigl( \prod_{k = 1}^{r - 1}q_k, q_r \bigr)}\vec\Bigl[\Bigl(\vec\bigkron_{k = r - 1}^{1}\mat{A}_k\Bigr)\t{(\vec\mat{A}_r)}\Bigr] \\
|
|
||||||
% \overset{\eqref{eq:kron-to-outer-perm}}{=} \mat{S}_{\bigl( \prod_{k = 1}^{r - 1}p_k, p_r \bigr), \bigl( \prod_{k = 1}^{r - 1}q_k, q_r \bigr)}\vec\Bigl[\mat{S}_{(p_1, \ldots, p_{r-1}), (q_1, \ldots, q_{r-1})}\Bigl(\vec\bigouter_{k = 1}^{r - 1}\mat{A}_k\Bigr)\t{(\vec\mat{A}_r)}\Bigr] \\
|
|
||||||
% \overset{(a)}{=} \mat{S}_{\bigl( \prod_{k = 1}^{r - 1}p_k, p_r \bigr), \bigl( \prod_{k = 1}^{r - 1}q_k, q_r \bigr)} \bigl(\mat{I}_{p_r q_r}\otimes\mat{S}_{(p_1, \ldots, p_{r-1}), (q_1, \ldots, q_{r-1})}\bigr)\vec\Bigl[\Bigl(\vec\bigouter_{k = 1}^{r - 1}\mat{A}_k\Bigr)\t{(\vec\mat{A}_r)}\Bigr] \\
|
|
||||||
% = \mat{S}_{\mat{p}, \mat{q}}\vec\bigouter_{k = 1}^{r}\mat{A}_k.
|
|
||||||
% \end{multline*}
|
|
||||||
% Equality $(a)$ uses the relation $\vec(\mat{C}\mat{a}\t{\mat{b}}) = (\mat{I}_{\dim(\mat{b})}\otimes\mat{C})\vec(\mat{a}\t{\mat{b}})$ for a matrix $\mat{C}$ and vectors $\mat{a}, \mat{b}$.
|
|
||||||
% \end{proof}
|
|
||||||
|
|
||||||
% \begin{remark}
|
|
||||||
% The permutation matrix $\mat{K}_{p, q}$ represents a perfect outer $p$-shuffle of $p q$ elements.
|
|
||||||
% \end{remark}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
\section{Proofs}\label{app:proofs}
|
\section{Proofs}\label{app:proofs}
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
|
|
||||||
\begin{proof}[\hypertarget{proof:sdr}{Proof of \cref{thm:sdr}}]
|
\begin{proof}[\hypertarget{proof:sdr}{Proof of \cref{thm:sdr}}]
|
||||||
A direct implication of Theorem~1 from \cite{BuraDuarteForzani2016} is that, under the exponential family \eqref{eq:quad-density} with natural statistic $\mat{t}(\ten{X})$,
|
A direct implication of Theorem~1 from \cite{BuraDuarteForzani2016} is that, under the exponential family \cref{eq:quad-density} with natural statistic $\mat{t}(\ten{X})$,
|
||||||
\begin{displaymath}
|
\begin{displaymath}
|
||||||
\t{\mat{\alpha}}(\mat{t}(\ten{X}) - \E\mat{t}(\ten{X}))
|
\t{\mat{\alpha}}(\mat{t}(\ten{X}) - \E\mat{t}(\ten{X}))
|
||||||
\end{displaymath}
|
\end{displaymath}
|
||||||
@ -474,7 +376,7 @@
|
|||||||
\end{proof}
|
\end{proof}
|
||||||
|
|
||||||
\begin{proof}[\hypertarget{proof:grad}{Proof of \cref{thm:grad}}]
|
\begin{proof}[\hypertarget{proof:grad}{Proof of \cref{thm:grad}}]
|
||||||
We first note that for any exponential family with density \eqref{eq:quad-density} the term $b(\mat{\eta}_{y})$ differentiated with respect to the natural parameter $\mat{\eta}_{y}$ is the expectation of the statistic $\mat{t}(\ten{X})$ given $Y = y$. In our case, we get $\nabla_{\mat{\eta}_{y}}b = (\nabla_{\mat{\eta}_{1{y}}}b, \nabla_{\mat{\eta}_2}b)$ with components
|
We first note that for any exponential family with density \cref{eq:quad-density} the term $b(\mat{\eta}_{y})$ differentiated with respect to the natural parameter $\mat{\eta}_{y}$ is the expectation of the statistic $\mat{t}(\ten{X})$ given $Y = y$. In our case, we get $\nabla_{\mat{\eta}_{y}}b = (\nabla_{\mat{\eta}_{1{y}}}b, \nabla_{\mat{\eta}_2}b)$ with components
|
||||||
\begin{displaymath}
|
\begin{displaymath}
|
||||||
\nabla_{\mat{\eta}_{1{y}}}b
|
\nabla_{\mat{\eta}_{1{y}}}b
|
||||||
= \E[\vec(\ten{X})\mid Y = y]
|
= \E[\vec(\ten{X})\mid Y = y]
|
||||||
@ -491,7 +393,7 @@
|
|||||||
= \pinv{\mat{D}_p}\vec\E[\ten{X}\circ\ten{X}\mid Y = y].
|
= \pinv{\mat{D}_p}\vec\E[\ten{X}\circ\ten{X}\mid Y = y].
|
||||||
\end{multline*}
|
\end{multline*}
|
||||||
The gradients are related to their derivatives by transposition, $\nabla_{\mat{\eta}_{1{y_i}}}b = \t{\D b(\mat{\eta}_{1{y}})}$ and $\nabla_{\mat{\eta}_2}b = \t{\D b(\mat{\eta}_2)}$.
|
The gradients are related to their derivatives by transposition, $\nabla_{\mat{\eta}_{1{y_i}}}b = \t{\D b(\mat{\eta}_{1{y}})}$ and $\nabla_{\mat{\eta}_2}b = \t{\D b(\mat{\eta}_2)}$.
|
||||||
Next, we provide the differentials of the natural parameter components from \eqref{eq:eta1} and \eqref{eq:eta2} in a quite direct form, without any further ``simplifications,'' because the down-stream computations will not benefit from re-expressing
|
Next, we provide the differentials of the natural parameter components from \cref{eq:eta1} and \cref{eq:eta2} in a quite direct form, without any further ``simplifications,'' because the down-stream computations will not benefit from re-expressing
|
||||||
\begin{align*}
|
\begin{align*}
|
||||||
\d\mat{\eta}_{1{y}}(\overline{\ten{\eta}})
|
\d\mat{\eta}_{1{y}}(\overline{\ten{\eta}})
|
||||||
&= \d\vec{\overline{\ten{\eta}}}, \\
|
&= \d\vec{\overline{\ten{\eta}}}, \\
|
||||||
@ -503,7 +405,7 @@
|
|||||||
\end{align*}
|
\end{align*}
|
||||||
All other combinations, namely $\d\mat{\eta}_{1{y}}(\mat{\Omega}_j)$, $\d\mat{\eta}_2(\overline{\ten{\eta}})$ and $\d\mat{\eta}_2(\mat{\beta}_j)$, are zero.
|
All other combinations, namely $\d\mat{\eta}_{1{y}}(\mat{\Omega}_j)$, $\d\mat{\eta}_2(\overline{\ten{\eta}})$ and $\d\mat{\eta}_2(\mat{\beta}_j)$, are zero.
|
||||||
|
|
||||||
Continuing with the partial differentials of $l_n$ from \eqref{eq:log-likelihood}
|
Continuing with the partial differentials of $l_n$ from \cref{eq:log-likelihood}
|
||||||
\begin{multline*}
|
\begin{multline*}
|
||||||
\d l_n(\overline{\ten{\eta}})
|
\d l_n(\overline{\ten{\eta}})
|
||||||
= \sum_{i = 1}^{n} (\langle \d\overline{\ten{\eta}}, \ten{X}_i \rangle - \D b(\mat{\eta}_{1{y_i}})\d\mat{\eta}_{1{y_i}}(\overline{\ten{\eta}}))
|
= \sum_{i = 1}^{n} (\langle \d\overline{\ten{\eta}}, \ten{X}_i \rangle - \D b(\mat{\eta}_{1{y_i}})\d\mat{\eta}_{1{y_i}}(\overline{\ten{\eta}}))
|
||||||
@ -708,9 +610,9 @@ We are now ready to provide and prove \cref{thm:M-estimator-asym-normal-on-manif
|
|||||||
|
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
\subsection{Asymptotic Normality of GMLM}
|
\subsection{Asymptotic Normality of GMLM}
|
||||||
We rewrite the log-likelihood \eqref{eq:log-likelihood} in a different form to simplify the proof of \cref{thm:asymptotic-normality-gmlm} and to provide the notation to express the regularity conditions of \cref{thm:asymptotic-normality-gmlm} in a compact form.
|
We rewrite the log-likelihood \cref{eq:log-likelihood} in a different form to simplify the proof of \cref{thm:asymptotic-normality-gmlm} and to provide the notation to express the regularity conditions of \cref{thm:asymptotic-normality-gmlm} in a compact form.
|
||||||
|
|
||||||
The first natural parameter component $\mat{\eta}_{1y}$ defined in \eqref{eq:eta1-manifold} can be written as
|
The first natural parameter component $\mat{\eta}_{1y}$ defined in \cref{eq:eta1-manifold} can be written as
|
||||||
\begin{align*}
|
\begin{align*}
|
||||||
\mat{\eta}_{1y}
|
\mat{\eta}_{1y}
|
||||||
&= \vec{\overline{\ten{\eta}}} + \mat{B}\vec{\ten{F}_y}= \mat{I}_p\vec{\overline{\ten{\eta}}} + (\t{(\vec{\ten{F}_y})}\otimes\mat{I}_p)\vec{\mat{B}} \\
|
&= \vec{\overline{\ten{\eta}}} + \mat{B}\vec{\ten{F}_y}= \mat{I}_p\vec{\overline{\ten{\eta}}} + (\t{(\vec{\ten{F}_y})}\otimes\mat{I}_p)\vec{\mat{B}} \\
|
||||||
@ -721,7 +623,7 @@ The first natural parameter component $\mat{\eta}_{1y}$ defined in \eqref{eq:eta
|
|||||||
\vec{\mat{B}}
|
\vec{\mat{B}}
|
||||||
\end{pmatrix}.
|
\end{pmatrix}.
|
||||||
\end{align*}
|
\end{align*}
|
||||||
The second natural parameter component $\mat{\eta}_2$, modeled in \eqref{eq:eta2-manifold}, relates to $\vech{\mat{\Omega}}$ linearly as
|
The second natural parameter component $\mat{\eta}_2$, modeled in \cref{eq:eta2-manifold}, relates to $\vech{\mat{\Omega}}$ linearly as
|
||||||
\begin{displaymath}
|
\begin{displaymath}
|
||||||
\mat{\eta}_2 = c\t{\mat{D}_p}\vec{\mat{\Omega}} = c\t{\mat{D}_p}\mat{D}_p\vech{\mat{\Omega}}.
|
\mat{\eta}_2 = c\t{\mat{D}_p}\vec{\mat{\Omega}} = c\t{\mat{D}_p}\mat{D}_p\vech{\mat{\Omega}}.
|
||||||
\end{displaymath}
|
\end{displaymath}
|
||||||
@ -738,13 +640,13 @@ This gives the following relation between $\mat{\eta}_y = (\mat{\eta}_{1y}, \mat
|
|||||||
\end{pmatrix} =: \mat{F}(y)\mat{\xi} \label{eq:eta-to-xi-linear-relation}
|
\end{pmatrix} =: \mat{F}(y)\mat{\xi} \label{eq:eta-to-xi-linear-relation}
|
||||||
\end{equation}
|
\end{equation}
|
||||||
where $\mat{F}(y)$ is a $p (p + 3) / 2\times p (p + 2 q + 3) / 2$ dimensional matrix-valued function in $y$. Moreover, for every $y$ the matrix $\mat{F}(y)$ is of full rank.
|
where $\mat{F}(y)$ is a $p (p + 3) / 2\times p (p + 2 q + 3) / 2$ dimensional matrix-valued function in $y$. Moreover, for every $y$ the matrix $\mat{F}(y)$ is of full rank.
|
||||||
The log-likelihood of model \eqref{eq:quad-density} for the unconstrained parameters $\xi\in\Xi$ is
|
The log-likelihood of model \cref{eq:quad-density} for the unconstrained parameters $\xi\in\Xi$ is
|
||||||
\begin{displaymath}
|
\begin{displaymath}
|
||||||
l_n(\mat{\xi})
|
l_n(\mat{\xi})
|
||||||
= \frac{1}{n}\sum_{i = 1}^{n} (\langle \mat{t}(\ten{X}), \mat{\eta}_{y} \rangle - b(\mat{\eta}_y))
|
= \frac{1}{n}\sum_{i = 1}^{n} (\langle \mat{t}(\ten{X}), \mat{\eta}_{y} \rangle - b(\mat{\eta}_y))
|
||||||
=: \frac{1}{n}\sum_{i = 1}^{n} m_{\mat{\xi}}(Z_i)
|
=: \frac{1}{n}\sum_{i = 1}^{n} m_{\mat{\xi}}(Z_i)
|
||||||
\end{displaymath}
|
\end{displaymath}
|
||||||
where $Z_i = (\ten{X}_i, Y_i)$. Using \eqref{eq:eta-to-xi-linear-relation} we can write
|
where $Z_i = (\ten{X}_i, Y_i)$. Using \cref{eq:eta-to-xi-linear-relation} we can write
|
||||||
\begin{displaymath}
|
\begin{displaymath}
|
||||||
m_{\mat{\xi}}(z) = \langle\mat{t}(\ten{X}), \mat{F}(y)\mat{\xi}\rangle - b(\mat{F}(y)\mat{\xi}).
|
m_{\mat{\xi}}(z) = \langle\mat{t}(\ten{X}), \mat{F}(y)\mat{\xi}\rangle - b(\mat{F}(y)\mat{\xi}).
|
||||||
\end{displaymath}
|
\end{displaymath}
|
||||||
@ -791,13 +693,13 @@ We continue with some more technical lemmas needed for the proof of \cref{thm:as
|
|||||||
&= (\mat{I}_{q_2}\otimes\mat{K}_{q_1, p_2}\otimes\mat{I}_{p_1})(\vec{\mat{A}_2}\otimes\vec{\mat{A}_1}) \label{eq:MagnusNeudecker1986-vec-kron-identity} \\
|
&= (\mat{I}_{q_2}\otimes\mat{K}_{q_1, p_2}\otimes\mat{I}_{p_1})(\vec{\mat{A}_2}\otimes\vec{\mat{A}_1}) \label{eq:MagnusNeudecker1986-vec-kron-identity} \\
|
||||||
&= (\mat{I}_{q_2}\otimes\mat{K}_{q_1, p_2}\otimes\mat{I}_{p_1})\vec(\mat{A}_1\circ \mat{A}_2). \nonumber
|
&= (\mat{I}_{q_2}\otimes\mat{K}_{q_1, p_2}\otimes\mat{I}_{p_1})\vec(\mat{A}_1\circ \mat{A}_2). \nonumber
|
||||||
\end{align}
|
\end{align}
|
||||||
This proves the statement for $r = 2$. The general statement for $r > 2$ follows by induction. Assuming \eqref{eq:kron-to-outer-perm} holds for $r - 1$, the induction step is
|
This proves the statement for $r = 2$. The general statement for $r > 2$ follows by induction. Assuming \cref{eq:kron-to-outer-perm} holds for $r - 1$, the induction step is
|
||||||
\begin{multline*}
|
\begin{multline*}
|
||||||
\vec{\bigkron_{k = r}^{1}}\mat{A}_k
|
\vec{\bigkron_{k = r}^{1}}\mat{A}_k
|
||||||
= \vec\Bigl(\mat{A}_r\otimes\bigkron_{k = r - 1}^{1}\mat{A}_k\Bigr) \\
|
= \vec\Bigl(\mat{A}_r\otimes\bigkron_{k = r - 1}^{1}\mat{A}_k\Bigr) \\
|
||||||
\overset{\eqref{eq:MagnusNeudecker1986-vec-kron-identity}}{=} \Bigl( \mat{I}_{q_r}\otimes\mat{K}_{\prod_{k = 1}^{r - 1}q_k, p_r}\otimes\mat{I}_{\prod_{k = 1}^{r - 1}p_k} \Bigr)\vec\Bigl((\vec\mat{A}_r)\otimes\vec\bigkron_{k = r - 1}^{1}\mat{A}_k\Bigr) \\
|
\overset{\cref{eq:MagnusNeudecker1986-vec-kron-identity}}{=} \Bigl( \mat{I}_{q_r}\otimes\mat{K}_{\prod_{k = 1}^{r - 1}q_k, p_r}\otimes\mat{I}_{\prod_{k = 1}^{r - 1}p_k} \Bigr)\vec\Bigl((\vec\mat{A}_r)\otimes\vec\bigkron_{k = r - 1}^{1}\mat{A}_k\Bigr) \\
|
||||||
= \mat{S}_{\bigl( \prod_{k = 1}^{r - 1}p_k, p_r \bigr), \bigl( \prod_{k = 1}^{r - 1}q_k, q_r \bigr)}\vec\Bigl[\Bigl(\vec\bigkron_{k = r - 1}^{1}\mat{A}_k\Bigr)\t{(\vec\mat{A}_r)}\Bigr] \\
|
= \mat{S}_{\bigl( \prod_{k = 1}^{r - 1}p_k, p_r \bigr), \bigl( \prod_{k = 1}^{r - 1}q_k, q_r \bigr)}\vec\Bigl[\Bigl(\vec\bigkron_{k = r - 1}^{1}\mat{A}_k\Bigr)\t{(\vec\mat{A}_r)}\Bigr] \\
|
||||||
\overset{\eqref{eq:kron-to-outer-perm}}{=} \mat{S}_{\bigl( \prod_{k = 1}^{r - 1}p_k, p_r \bigr), \bigl( \prod_{k = 1}^{r - 1}q_k, q_r \bigr)}\vec\Bigl[\mat{S}_{(p_1, \ldots, p_{r-1}), (q_1, \ldots, q_{r-1})}\Bigl(\vec\bigouter_{k = 1}^{r - 1}\mat{A}_k\Bigr)\t{(\vec\mat{A}_r)}\Bigr] \\
|
\overset{\cref{eq:kron-to-outer-perm}}{=} \mat{S}_{\bigl( \prod_{k = 1}^{r - 1}p_k, p_r \bigr), \bigl( \prod_{k = 1}^{r - 1}q_k, q_r \bigr)}\vec\Bigl[\mat{S}_{(p_1, \ldots, p_{r-1}), (q_1, \ldots, q_{r-1})}\Bigl(\vec\bigouter_{k = 1}^{r - 1}\mat{A}_k\Bigr)\t{(\vec\mat{A}_r)}\Bigr] \\
|
||||||
\overset{(a)}{=} \mat{S}_{\bigl( \prod_{k = 1}^{r - 1}p_k, p_r \bigr), \bigl( \prod_{k = 1}^{r - 1}q_k, q_r \bigr)} \bigl(\mat{I}_{p_rq_r}\otimes\mat{S}_{(p_1, \ldots, p_{r-1}), (q_1, \ldots, q_{r-1})}\bigr)\vec\Bigl[\Bigl(\vec\bigouter_{k = 1}^{r - 1}\mat{A}_k\Bigr)\t{(\vec\mat{A}_r)}\Bigr] \\
|
\overset{(a)}{=} \mat{S}_{\bigl( \prod_{k = 1}^{r - 1}p_k, p_r \bigr), \bigl( \prod_{k = 1}^{r - 1}q_k, q_r \bigr)} \bigl(\mat{I}_{p_rq_r}\otimes\mat{S}_{(p_1, \ldots, p_{r-1}), (q_1, \ldots, q_{r-1})}\bigr)\vec\Bigl[\Bigl(\vec\bigouter_{k = 1}^{r - 1}\mat{A}_k\Bigr)\t{(\vec\mat{A}_r)}\Bigr] \\
|
||||||
=\mat{S}_{\mat{p},\mat{q}}\vec\bigouter_{k = 1}^{r}\mat{A}_k.
|
=\mat{S}_{\mat{p},\mat{q}}\vec\bigouter_{k = 1}^{r}\mat{A}_k.
|
||||||
\end{multline*}
|
\end{multline*}
|
||||||
@ -817,7 +719,7 @@ We continue with some more technical lemmas needed for the proof of \cref{thm:as
|
|||||||
= \bigkron_{k = r}^{j + 1}(\vec{\mat{A}_k})\otimes\mat{I}_{p_j q_j}\otimes\bigkron_{k = j - 1}^{1}(\vec{\mat{A}_k})
|
= \bigkron_{k = r}^{j + 1}(\vec{\mat{A}_k})\otimes\mat{I}_{p_j q_j}\otimes\bigkron_{k = j - 1}^{1}(\vec{\mat{A}_k})
|
||||||
\end{equation}
|
\end{equation}
|
||||||
and let $\gamma_j$ be $p_j q_j\times d_j$ matrices with $d_j \geq\dim\manifold{A}_j$ which span the tangent space $T_{\mat{A}_j}\manifold{A}_j$ of $\manifold{A}$ at $\mat{A}_j\in\manifold{A}_j$, that is $\Span\gamma_j = T_{\mat{A}_j}\manifold{A}_j$.
|
and let $\gamma_j$ be $p_j q_j\times d_j$ matrices with $d_j \geq\dim\manifold{A}_j$ which span the tangent space $T_{\mat{A}_j}\manifold{A}_j$ of $\manifold{A}$ at $\mat{A}_j\in\manifold{A}_j$, that is $\Span\gamma_j = T_{\mat{A}_j}\manifold{A}_j$.
|
||||||
Then, with the permutation matrix $\mat{S}_{\mat{p}, \mat{q}}$ defined in \eqref{eq:S_pq}, the $p q \times \sum_{k = 1}^{r} d_j$ dimensional matrix
|
Then, with the permutation matrix $\mat{S}_{\mat{p}, \mat{q}}$ defined in \cref{eq:S_pq}, the $p q \times \sum_{k = 1}^{r} d_j$ dimensional matrix
|
||||||
\begin{displaymath}
|
\begin{displaymath}
|
||||||
\mat{P}_{\mat{A}} = \mat{S}_{\mat{p}, \mat{q}}\left[\mat{\Gamma}_1\mat{\gamma}_1, \mat{\Gamma}_2\mat{\gamma}_2, \ldots, \mat{\Gamma}_r\mat{\gamma}_r\right]
|
\mat{P}_{\mat{A}} = \mat{S}_{\mat{p}, \mat{q}}\left[\mat{\Gamma}_1\mat{\gamma}_1, \mat{\Gamma}_2\mat{\gamma}_2, \ldots, \mat{\Gamma}_r\mat{\gamma}_r\right]
|
||||||
\end{displaymath}
|
\end{displaymath}
|
||||||
@ -825,7 +727,7 @@ We continue with some more technical lemmas needed for the proof of \cref{thm:as
|
|||||||
\end{lemma}
|
\end{lemma}
|
||||||
\begin{proof}
|
\begin{proof}
|
||||||
The statement that $\manifold{K}$ is an embedded manifold follows via induction using \cref{thm:kron-manifolds}.
|
The statement that $\manifold{K}$ is an embedded manifold follows via induction using \cref{thm:kron-manifolds}.
|
||||||
We compute the differential of the vectorized Kronecker product using \cref{thm:kron-perm} where $\mat{S}_{\mat{p}, \mat{q}}$ is the permutation \eqref{eq:S_pq} defined therein.
|
We compute the differential of the vectorized Kronecker product using \cref{thm:kron-perm} where $\mat{S}_{\mat{p}, \mat{q}}$ is the permutation \cref{eq:S_pq} defined therein.
|
||||||
\begin{multline*}
|
\begin{multline*}
|
||||||
\d\vec\bigotimes_{k = r}^{1}\mat{A}_k
|
\d\vec\bigotimes_{k = r}^{1}\mat{A}_k
|
||||||
= \vec\sum_{j = 1}^{r}\bigkron_{k = r}^{1}(\ternary{k = j}{\d\mat{A}_j}{\mat{A}_k}) \\
|
= \vec\sum_{j = 1}^{r}\bigkron_{k = r}^{1}(\ternary{k = j}{\d\mat{A}_j}{\mat{A}_k}) \\
|
||||||
@ -856,7 +758,7 @@ We continue with some more technical lemmas needed for the proof of \cref{thm:as
|
|||||||
\begin{proof}[\hypertarget{proof:asymptotic-normality-gmlm}{Proof of \cref{thm:asymptotic-normality-gmlm}}]
|
\begin{proof}[\hypertarget{proof:asymptotic-normality-gmlm}{Proof of \cref{thm:asymptotic-normality-gmlm}}]
|
||||||
The proof consists of three parts. First, we show the existence of a consistent strong M-estimator by applying \cref{thm:M-estimator-consistency-on-subsets}. Next, we apply \cref{thm:M-estimator-asym-normal-on-manifolds} to obtain its asymptotic normality. We conclude by computing the missing parts of the asymptotic covariance matrix $\mat{\Sigma}_{\mat{\theta}_0}$ provided by \cref{thm:M-estimator-asym-normal-on-manifolds}.
|
The proof consists of three parts. First, we show the existence of a consistent strong M-estimator by applying \cref{thm:M-estimator-consistency-on-subsets}. Next, we apply \cref{thm:M-estimator-asym-normal-on-manifolds} to obtain its asymptotic normality. We conclude by computing the missing parts of the asymptotic covariance matrix $\mat{\Sigma}_{\mat{\theta}_0}$ provided by \cref{thm:M-estimator-asym-normal-on-manifolds}.
|
||||||
|
|
||||||
We check whether the conditions of \cref{thm:M-estimator-consistency-on-subsets} are satisfied. On $\Xi$, the mapping $\mat{\xi}\mapsto m_{\mat{\xi}}(z) = m_{\mat{\xi}}(\ten{X},y) = \langle \mat{F}(y)\mat{\xi}, \mat{t}(\ten{X}) \rangle - b(\mat{F}(y)\mat{\xi})$ is strictly concave for every $z$ because $\mat{\xi}\mapsto\mat{F}(y)\mat{\xi}$ is linear and $b$ is strictly convex by \cref{cond:differentiable-and-convex}. Since $\ten{X} \mid Y$ is distributed according to \eqref{eq:quad-density}, the function $M(\mat{\xi}) = \E m_{\mat{\xi}}(Z)$ is well defined by \cref{cond:moments}. Let $\mat{\xi}_k = (\vec{\overline{\ten{\eta}}_k}, \vec{\mat{B}_k}, \vech{\mat{\Omega}_k})$, and $f_{\mat{\xi}_k}$ be the pdf of $\ten{X} \mid Y$ indexed by $\mat{\xi}_k$, for $k = 1, 2$. If $\mat{\xi}_1\ne \mat{\xi}_2$, then $f_{\mat{\xi}_1} \neq f_{\mat{\xi}_2}$, which obtains that the true $\mat{\theta}_0$ is a unique maximizer of $\mat{\theta}_0\in\Theta\subseteq\Xi$ by applying Lemma~5.35 from \cite{vanderVaart1998}. Finally, under \cref{cond:finite-sup-on-compacta}, all assumptions of \cref{thm:M-estimator-consistency-on-subsets} are fulfilled yielding the existence of a consistent strong M-estimator over $\Theta\subseteq\Xi$.
|
We check whether the conditions of \cref{thm:M-estimator-consistency-on-subsets} are satisfied. On $\Xi$, the mapping $\mat{\xi}\mapsto m_{\mat{\xi}}(z) = m_{\mat{\xi}}(\ten{X},y) = \langle \mat{F}(y)\mat{\xi}, \mat{t}(\ten{X}) \rangle - b(\mat{F}(y)\mat{\xi})$ is strictly concave for every $z$ because $\mat{\xi}\mapsto\mat{F}(y)\mat{\xi}$ is linear and $b$ is strictly convex by \cref{cond:differentiable-and-convex}. Since $\ten{X} \mid Y$ is distributed according to \cref{eq:quad-density}, the function $M(\mat{\xi}) = \E m_{\mat{\xi}}(Z)$ is well defined by \cref{cond:moments}. Let $\mat{\xi}_k = (\vec{\overline{\ten{\eta}}_k}, \vec{\mat{B}_k}, \vech{\mat{\Omega}_k})$, and $f_{\mat{\xi}_k}$ be the pdf of $\ten{X} \mid Y$ indexed by $\mat{\xi}_k$, for $k = 1, 2$. If $\mat{\xi}_1\ne \mat{\xi}_2$, then $f_{\mat{\xi}_1} \neq f_{\mat{\xi}_2}$, which obtains that the true $\mat{\theta}_0$ is a unique maximizer of $\mat{\theta}_0\in\Theta\subseteq\Xi$ by applying Lemma~5.35 from \cite{vanderVaart1998}. Finally, under \cref{cond:finite-sup-on-compacta}, all assumptions of \cref{thm:M-estimator-consistency-on-subsets} are fulfilled yielding the existence of a consistent strong M-estimator over $\Theta\subseteq\Xi$.
|
||||||
|
|
||||||
Next, let $\hat{\mat{\theta}}_n$ be a strong M-estimator on $\Theta\subseteq\Xi$, whose existence and consistency was shown in the previous step. Since $z\mapsto m_{\mat{\xi}}(z)$ is measurable for all $\mat{\xi}\in\Xi$, it is also measurable in a neighborhood of $\mat{\theta}_0$. The differentiability of $\mat{\theta}\mapsto m_{\mat{\theta}}(z)$ is stated in \cref{cond:differentiable-and-convex}. For the Lipschitz condition, let $K\subseteq\Xi$ be a compact neighborhood of $\mat{\theta}_0$, which exists since $\Xi$ is open. Then,
|
Next, let $\hat{\mat{\theta}}_n$ be a strong M-estimator on $\Theta\subseteq\Xi$, whose existence and consistency was shown in the previous step. Since $z\mapsto m_{\mat{\xi}}(z)$ is measurable for all $\mat{\xi}\in\Xi$, it is also measurable in a neighborhood of $\mat{\theta}_0$. The differentiability of $\mat{\theta}\mapsto m_{\mat{\theta}}(z)$ is stated in \cref{cond:differentiable-and-convex}. For the Lipschitz condition, let $K\subseteq\Xi$ be a compact neighborhood of $\mat{\theta}_0$, which exists since $\Xi$ is open. Then,
|
||||||
\begin{multline*}
|
\begin{multline*}
|
||||||
@ -1007,7 +909,7 @@ which contains the $k$th mode first moment estimate in its diagonal $\hat{\mat{M
|
|||||||
|
|
||||||
For the purpose of an initial value estimate, we treat each of the columns of the matricized observation $(\ten{X}_i)_{(k)}$ as an i.i.d. observation of a $p_k$ dimensional random variable $Z_k$. Through this reasoning we get with $n$ i.i.d. observations $\ten{X}_i$ a total of $n \prod_{j\neq k}p_j$ realizations of the random variable $Z_k$ for each of the modes $k = 1, \ldots, r$.
|
For the purpose of an initial value estimate, we treat each of the columns of the matricized observation $(\ten{X}_i)_{(k)}$ as an i.i.d. observation of a $p_k$ dimensional random variable $Z_k$. Through this reasoning we get with $n$ i.i.d. observations $\ten{X}_i$ a total of $n \prod_{j\neq k}p_j$ realizations of the random variable $Z_k$ for each of the modes $k = 1, \ldots, r$.
|
||||||
|
|
||||||
Continuing this reasoning, the elements of $(\hat{\mat{M}}_{1(k)})_{j}$ are the estimates of the marginal probability $P((Z_k)_j = 1)$ of the $j$th element of $Z_k$ being $1$. Similarly, for $l \neq j$, the entry $(\hat{\mat{M}}_{2(k)})_{j l}$ estimates the marginal probability of two-way interactions, $P((Z_k)_j = 1, (Z_k)_l = 1)$. Now, we set the diagonal elements of $\mat{\Omega}_k$ to zero. For the off diagonal elements of $\mat{\Omega}_k$, we equate the conditional probabilities $P((Z_k)_j = 1 \mid (Z_k)_{-j} = \mat{0})$ and $P((Z_k)_j = 1, (Z_k)_l = 1\mid (Z_k)_{-j, -l} = \mat{0})$ with the marginal probability estimates $(\hat{\mat{M}}_{1(k)})_{j}$ and $(\hat{\mat{M}}_{2(k)})_{j l}$, respectively. Applying \eqref{eq:ising-two-way-log-odds} gives the initial component-wise estimates $\hat{\mat{\Omega}}_k^{(0)}$,
|
Continuing this reasoning, the elements of $(\hat{\mat{M}}_{1(k)})_{j}$ are the estimates of the marginal probability $P((Z_k)_j = 1)$ of the $j$th element of $Z_k$ being $1$. Similarly, for $l \neq j$, the entry $(\hat{\mat{M}}_{2(k)})_{j l}$ estimates the marginal probability of two-way interactions, $P((Z_k)_j = 1, (Z_k)_l = 1)$. Now, we set the diagonal elements of $\mat{\Omega}_k$ to zero. For the off diagonal elements of $\mat{\Omega}_k$, we equate the conditional probabilities $P((Z_k)_j = 1 \mid (Z_k)_{-j} = \mat{0})$ and $P((Z_k)_j = 1, (Z_k)_l = 1\mid (Z_k)_{-j, -l} = \mat{0})$ with the marginal probability estimates $(\hat{\mat{M}}_{1(k)})_{j}$ and $(\hat{\mat{M}}_{2(k)})_{j l}$, respectively. Applying \cref{eq:ising-two-way-log-odds} gives the initial component-wise estimates $\hat{\mat{\Omega}}_k^{(0)}$,
|
||||||
\begin{equation}\label{eq:ising-init-Omegas}
|
\begin{equation}\label{eq:ising-init-Omegas}
|
||||||
(\hat{\mat{\Omega}}_k^{(0)})_{j j} = 0,
|
(\hat{\mat{\Omega}}_k^{(0)})_{j j} = 0,
|
||||||
\qquad
|
\qquad
|
||||||
@ -1020,7 +922,7 @@ Given initial values, the gradients derived in \cref{thm:grad} can be evaluated
|
|||||||
\begin{equation}\label{eq:ising-m2}
|
\begin{equation}\label{eq:ising-m2}
|
||||||
\ten{g}_2(\ten{\gamma}_y)_{(1, \ldots, r)} = \E\left[(\vec{\ten{X}})\t{(\vec{\ten{X}})}\mid Y = y\right] = p_0(\mat{\gamma}_y)\sum_{\mat{x}\in\{0, 1\}^{p}}\mat{x}\t{\mat{x}}\exp(\t{\vech(\mat{x}\t{\mat{x}})}\mat{\gamma}_y).
|
\ten{g}_2(\ten{\gamma}_y)_{(1, \ldots, r)} = \E\left[(\vec{\ten{X}})\t{(\vec{\ten{X}})}\mid Y = y\right] = p_0(\mat{\gamma}_y)\sum_{\mat{x}\in\{0, 1\}^{p}}\mat{x}\t{\mat{x}}\exp(\t{\vech(\mat{x}\t{\mat{x}})}\mat{\gamma}_y).
|
||||||
\end{equation}
|
\end{equation}
|
||||||
The natural parameter $\mat{\gamma}_y$ is evaluated via \eqref{eq:ising-natural-params} enabling us to compute the partial gradients of the log-likelihood $l_n$ \eqref{eq:log-likelihood} for the Ising model by \cref{thm:grad} for the GMLM parameters $\mat{\beta}_k$ and $\mat{\Omega}_k$, $k = 1, \ldots, r$, at the current iterate $\mat{\theta}^{(I)} = (\mat{\beta}_1^{(I)}, \ldots, \mat{\beta}_r^{(I)}, \mat{\Omega}_1^{(I)}, \ldots, \mat{\Omega}_r^{(I)})$. Using classic gradient ascent for maximizing the log-likelihood, we have to specify a learning rate $\lambda\in\mathbb{R}_{+}$, usually a value close to $10^{-3}$. The update rule is
|
The natural parameter $\mat{\gamma}_y$ is evaluated via \cref{eq:ising-natural-params} enabling us to compute the partial gradients of the log-likelihood $l_n$ \cref{eq:log-likelihood} for the Ising model by \cref{thm:grad} for the GMLM parameters $\mat{\beta}_k$ and $\mat{\Omega}_k$, $k = 1, \ldots, r$, at the current iterate $\mat{\theta}^{(I)} = (\mat{\beta}_1^{(I)}, \ldots, \mat{\beta}_r^{(I)}, \mat{\Omega}_1^{(I)}, \ldots, \mat{\Omega}_r^{(I)})$. Using classic gradient ascent for maximizing the log-likelihood, we have to specify a learning rate $\lambda\in\mathbb{R}_{+}$, usually a value close to $10^{-3}$. The update rule is
|
||||||
\begin{displaymath}
|
\begin{displaymath}
|
||||||
\mat{\theta}^{(I + 1)} = \mat{\theta}^{(I)} + \lambda\nabla_{\mat{\theta}} l_n(\mat{\theta})\bigr|_{\mat{\theta} = \mat{\theta}^{(I)}},
|
\mat{\theta}^{(I + 1)} = \mat{\theta}^{(I)} + \lambda\nabla_{\mat{\theta}} l_n(\mat{\theta})\bigr|_{\mat{\theta} = \mat{\theta}^{(I)}},
|
||||||
\end{displaymath}
|
\end{displaymath}
|
||||||
@ -1039,7 +941,7 @@ The parameters $\nu = 0.9$, $\lambda = 10^{-3}$ and $\epsilon\approx 1.49\cdot 1
|
|||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
In the case of a finite number of observations, specifically in data sets with a small number of observations $n$, the situation where one component is always either zero or one can occur. It is also possible to observe two exclusive components. In practice, this situation of a ``degenerate'' data set should be protected against. Working with parameters on a log scale gives estimates of $\pm\infty$, which is outside the parameter space and breaks our optimization algorithm.
|
In the case of a finite number of observations, specifically in data sets with a small number of observations $n$, the situation where one component is always either zero or one can occur. It is also possible to observe two exclusive components. In practice, this situation of a ``degenerate'' data set should be protected against. Working with parameters on a log scale gives estimates of $\pm\infty$, which is outside the parameter space and breaks our optimization algorithm.
|
||||||
|
|
||||||
The first situation where this needs to be addressed is in \eqref{eq:ising-init-Omegas}, where we set initial estimates for $\mat{\Omega}_k$. To avoid division by zero as well as evaluating the log of zero, we adapt \eqref{eq:ising-mode-moments}, the mode-wise moment estimates $\hat{\mat{M}}_{2(k)}$. A simple method is to replace the ``degenerate'' components, that is, entries with value $0$ or $1$, with the smallest positive estimate of exactly one occurrence $p_k / n p$, or all but one occurrence $1 - p_k / n p$, respectively.
|
The first situation where this needs to be addressed is in \cref{eq:ising-init-Omegas}, where we set initial estimates for $\mat{\Omega}_k$. To avoid division by zero as well as evaluating the log of zero, we adapt \cref{eq:ising-mode-moments}, the mode-wise moment estimates $\hat{\mat{M}}_{2(k)}$. A simple method is to replace the ``degenerate'' components, that is, entries with value $0$ or $1$, with the smallest positive estimate of exactly one occurrence $p_k / n p$, or all but one occurrence $1 - p_k / n p$, respectively.
|
||||||
|
|
||||||
The same problem is present in gradient optimization. Therefore, before starting the optimization, we detect degenerate combinations. We compute upper and lower bounds for the ``degenerate'' element in the Kronecker product $\hat{\mat{\Omega}} = \bigkron_{k = r}^{1}\hat{\mat{\Omega}}_k$. After every gradient update, we check if any of the ``degenerate'' elements fall outside of the bounds. In that case, we adjust all the elements of the Kronecker component estimates $\hat{\mat{\Omega}}_k$, corresponding to the ``degenerate'' element of their Kronecker product, to fall inside the precomputed bounds. While doing so, we try to alter every component as little as possible to ensure that the non-degenerate elements in $\hat{\mat{\Omega}}$, affected by this change due to its Kronecker structure, are altered as little as possible. The exact details are technically cumbersome while providing little insight.
|
The same problem is present in gradient optimization. Therefore, before starting the optimization, we detect degenerate combinations. We compute upper and lower bounds for the ``degenerate'' element in the Kronecker product $\hat{\mat{\Omega}} = \bigkron_{k = r}^{1}\hat{\mat{\Omega}}_k$. After every gradient update, we check if any of the ``degenerate'' elements fall outside of the bounds. In that case, we adjust all the elements of the Kronecker component estimates $\hat{\mat{\Omega}}_k$, corresponding to the ``degenerate'' element of their Kronecker product, to fall inside the precomputed bounds. While doing so, we try to alter every component as little as possible to ensure that the non-degenerate elements in $\hat{\mat{\Omega}}$, affected by this change due to its Kronecker structure, are altered as little as possible. The exact details are technically cumbersome while providing little insight.
|
||||||
|
|
||||||
@ -1047,9 +949,9 @@ The same problem is present in gradient optimization. Therefore, before starting
|
|||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
\subsection{Slightly Bigger Dimensions}\label{sec:ising-bigger-dim}
|
\subsection{Slightly Bigger Dimensions}\label{sec:ising-bigger-dim}
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
A big challenge for the Ising model is its high computational complexity as it involves summing over all binary vectors of length $p = \prod_{k = 1}^{r}p_k$ in the partition function \eqref{eq:ising-partition-function}. Exact computation of the partition function requires summing all $2^p$ binary vectors. For small dimensions, say $p\approx 10$, this is easily computed. Increasing the dimension beyond $20$ becomes extremely expensive and impossible for a dimension bigger than $30$. Trying to avoid the evaluation of the log-likelihood and only computing its partial gradients via \cref{thm:grad} does not resolve the issue. The gradients require the inverse link, that is the second moment \eqref{eq:ising-m2}, which still involves summing $2^p$ terms if the scaling factor $p_0$ is dropped. Basically, with our model, this means that the optimization of the Ising model using exactly computed gradients is impossible for moderately sized problems.
|
A big challenge for the Ising model is its high computational complexity as it involves summing over all binary vectors of length $p = \prod_{k = 1}^{r}p_k$ in the partition function \cref{eq:ising-partition-function}. Exact computation of the partition function requires summing all $2^p$ binary vectors. For small dimensions, say $p\approx 10$, this is easily computed. Increasing the dimension beyond $20$ becomes extremely expensive and impossible for a dimension bigger than $30$. Trying to avoid the evaluation of the log-likelihood and only computing its partial gradients via \cref{thm:grad} does not resolve the issue. The gradients require the inverse link, that is the second moment \cref{eq:ising-m2}, which still involves summing $2^p$ terms if the scaling factor $p_0$ is dropped. Basically, with our model, this means that the optimization of the Ising model using exactly computed gradients is impossible for moderately sized problems.
|
||||||
|
|
||||||
When $p=\prod_{i=1}^r p_i > 20$, we use a Monte-Carlo method to estimate the second moment \eqref{eq:ising-m2}, required to compute the partial gradients of the log-likelihood. Specifically, we use a Gibbs-Sampler to sample from the conditional distribution and approximate the second moment in an importance sampling framework. This can be implemented quite efficiently and the estimation accuracy for the second moment is evaluated experimentally. Simultaneously, we use the same approach to estimate the partition function. This, though, is inaccurate and may only be used to get a rough idea of the log-likelihood. Regardless, for our method, we only need the gradient for optimization where appropriate break conditions, not based on the likelihood, lead to a working method for MLE estimation.
|
When $p=\prod_{i=1}^r p_i > 20$, we use a Monte-Carlo method to estimate the second moment \cref{eq:ising-m2}, required to compute the partial gradients of the log-likelihood. Specifically, we use a Gibbs-Sampler to sample from the conditional distribution and approximate the second moment in an importance sampling framework. This can be implemented quite efficiently and the estimation accuracy for the second moment is evaluated experimentally. Simultaneously, we use the same approach to estimate the partition function. This, though, is inaccurate and may only be used to get a rough idea of the log-likelihood. Regardless, for our method, we only need the gradient for optimization where appropriate break conditions, not based on the likelihood, lead to a working method for MLE estimation.
|
||||||
|
|
||||||
\begin{figure}[!hpt]
|
\begin{figure}[!hpt]
|
||||||
\centering
|
\centering
|
||||||
@ -1095,7 +997,7 @@ which gives the complete $48$ dimensional vectorized reduction by stacking the p
|
|||||||
= (\vec{\ten{R}(\ten{X}_{\text{white pawn}})}, \ldots, \vec{\ten{R}(\ten{X}_{\text{black king}})})
|
= (\vec{\ten{R}(\ten{X}_{\text{white pawn}})}, \ldots, \vec{\ten{R}(\ten{X}_{\text{black king}})})
|
||||||
= \t{\mat{B}}\vec(\ten{X} - \E\ten{X}).
|
= \t{\mat{B}}\vec(\ten{X} - \E\ten{X}).
|
||||||
\end{displaymath}
|
\end{displaymath}
|
||||||
The second line encodes all the piece-wise reductions in a block diagonal full reduction matrix $\mat{B}$ of dimension $768\times 48$ which is applied to the vectorized 3D tensor $\ten{X}$ combining all the piece components $\ten{X}_{\mathrm{piece}}$ into a single tensor of dimension $8\times 8\times 12$. This is a reduction to $6.25\%$ of the original dimension. The $R^2$ statistic of the GAM fitted on $10^5$ new reduced samples is $R^2_{\mathrm{gam}}\approx 46\%$. A linear model on the reduced data achieves $R^2_{\mathrm{lm}}\approx 26\%$ which clearly shows the non-linear relation. On the other hand, the static evaluation of the \emph{Schach H\"ornchen}\footnote{Main author's chess engine.} engine, given the full position (\emph{not} reduced), achieves an $R^2_{\mathrm{hce}}\approx 52\%$. The $42\%$ are reasonably well compared to $51\%$ of the engine static evaluation which gets the original position and uses chess specific expert knowledge. Features the static evaluation includes, which are expected to be learned by the GMLM mixture model, are the \emph{material} (piece values) and \emph{piece square tables} (PSQT, preferred piece type positions). In addition, the static evaluation includes chess specific features like \emph{king safety}, \emph{pawn structure}, or \emph{rooks on open files}. This lets us conclude that the reduction captures most of the relevant features possible, given the oversimplified modeling we performed.
|
The second line encodes all the piece-wise reductions in a block diagonal full reduction matrix $\mat{B}$ of dimension $768\times 48$ which is applied to the vectorized 3D tensor $\ten{X}$ combining all the piece components $\ten{X}_{\mathrm{piece}}$ into a single tensor of dimension $8\times 8\times 12$. This is a reduction to $6.25\%$ of the original dimension. The $R^2$ statistic of the GAM fitted on $10^5$ new reduced samples is $R^2_{\mathrm{gam}}\approx 46\%$. A linear model on the reduced data achieves $R^2_{\mathrm{lm}}\approx 26\%$ which clearly shows the non-linear relation. On the other hand, the static evaluation of the \emph{Schach H\"ornchen}\footnote{Main author's chess engine.} engine, given the full position (\emph{not} reduced), achieves an $R^2_{\mathrm{hce}}\approx 52\%$. The $46\%$ compares reasonably well with the $52\%$ achieved by the engine's static evaluation, which gets the original position and uses chess specific expert knowledge. Features the static evaluation includes, which are expected to be learned by the GMLM mixture model, are the \emph{material} (piece values) and \emph{piece square tables} (PSQT, preferred piece type positions). In addition, the static evaluation includes chess specific features like \emph{king safety}, \emph{pawn structure}, or \emph{rooks on open files}. This lets us conclude that the reduction captures most of the relevant features possible, given the oversimplified modeling we performed.
|
||||||
|
|
||||||
\subsection{Interpretation}
|
\subsection{Interpretation}
|
||||||
For a compact interpretation of the estimated reduction we construct PSQTs. To do so we use the linear model from the validation section. Then, we rewrite the combined linear reduction and linear model in terms of PSQTs. Due to the nature of our analysis, it does not provide the usual PSQT interpreted as the square where a piece is most powerful, but instead that the presence of a piece on a particular square is indicative of a winning position. Those two are different in the sense that the first indicates the potential of a piece on a particular square while the second also incorporates the cases where this potential was already used, resulting in a winning position (often due to an oversight of the opponent).
|
For a compact interpretation of the estimated reduction we construct PSQTs. To do so we use the linear model from the validation section. Then, we rewrite the combined linear reduction and linear model in terms of PSQTs. Due to the nature of our analysis, it does not provide the usual PSQT interpreted as the square where a piece is most powerful, but instead that the presence of a piece on a particular square is indicative of a winning position. Those two are different in the sense that the first indicates the potential of a piece on a particular square while the second also incorporates the cases where this potential was already used, resulting in a winning position (often due to an oversight of the opponent).
|
||||||
@ -1104,7 +1006,7 @@ Let $\mat{B}$ be the $768\times 48$ full vectorized linear reduction. This is th
|
|||||||
\begin{equation}\label{eq:chess-lm}
|
\begin{equation}\label{eq:chess-lm}
|
||||||
y = a + \t{\mat{b}}\t{\mat{B}}\vec(\ten{X} - \E\ten{X}) + \epsilon
|
y = a + \t{\mat{b}}\t{\mat{B}}\vec(\ten{X} - \E\ten{X}) + \epsilon
|
||||||
\end{equation}
|
\end{equation}
|
||||||
with an unknown mean zero error term $\epsilon$ and treating the binary tensor $\ten{X}$ as continuous. Decomposing the linear model coefficients into blocks of $4$ gives per piece coefficients $\mat{b}_{\mathrm{piece}}$ which combine with the diagonal blocks $\mat{B}_{\mathrm{piece}}$ of $\mat{B}$ only. Rewriting \eqref{eq:chess-lm} gives
|
with an unknown mean zero error term $\epsilon$ and treating the binary tensor $\ten{X}$ as continuous. Decomposing the linear model coefficients into blocks of $4$ gives per piece coefficients $\mat{b}_{\mathrm{piece}}$ which combine with the diagonal blocks $\mat{B}_{\mathrm{piece}}$ of $\mat{B}$ only. Rewriting \cref{eq:chess-lm} gives
|
||||||
\begin{align*}
|
\begin{align*}
|
||||||
y &= a + \sum_{\mathrm{piece}}\t{(\mat{B}_{\mathrm{piece}}\mat{b}_{\mathrm{piece}})}\vec(\ten{X}_{\mathrm{piece}} - \E\ten{X}_{\mathrm{piece}}) + \epsilon \\
|
y &= a + \sum_{\mathrm{piece}}\t{(\mat{B}_{\mathrm{piece}}\mat{b}_{\mathrm{piece}})}\vec(\ten{X}_{\mathrm{piece}} - \E\ten{X}_{\mathrm{piece}}) + \epsilon \\
|
||||||
&= \tilde{a} + \sum_{\mathrm{piece}}\langle
|
&= \tilde{a} + \sum_{\mathrm{piece}}\langle
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user