
\documentclass[aos]{imsart}
%% Packages
\usepackage[utf8]{inputenc}
\usepackage[LSF, T1]{fontenc}
% \usepackage{lmodern} % TODO: interferes with "imsart" classed author address display
\usepackage{amsthm, amsmath, amsfonts, amssymb, bm, pifont}
\usepackage{float}
\usepackage{chessfss}
\usepackage{scalerel}
\usepackage[dvipsnames]{xcolor}
\usepackage{graphicx}
\usepackage[authoryear]{natbib}
\usepackage[colorlinks, citecolor = blue, urlcolor = blue]{hyperref}
\usepackage[noabbrev, capitalize, nameinlink]{cleveref} % after hyperref
\startlocaldefs
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Uncomment next line to change %%
%% the type of equation numbering %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\numberwithin{equation}{section}
%% %%
%% For Axiom, Claim, Corollary, Hypothesis, %%
%% Lemma, Theorem, Proposition %%
%% use \theoremstyle{plain} %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}{Corollary}
\newtheorem{proposition}{Proposition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% For Assumption, Definition, Example, %%
%% Notation, Property, Remark, Fact %%
%% use \theoremstyle{remark} %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\theoremstyle{remark}
\newtheorem{definition}{Definition}
\newtheorem{condition}{Condition}
\newtheorem{example}{Example}
\newtheorem{remark}{Remark}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Please put your definitions here: %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Clever ref additional reference name
\crefname{condition}{Condition}{Conditions}
\Crefname{condition}{Condition}{Conditions}
\crefrangelabelformat{condition}{#3#1#4-#5#2#6}
% matrices
\newcommand*{\mat}[1]{\boldsymbol{#1}}
% tensors (special case for lower case calligraphic letters)
\newcommand*{\ten}[1]{
\ifnum\pdfstrcmp{#1}{`}=1 % lowercase argument
\mathfrak{#1}
\else % uppercase argument
\mathcal{#1}
\fi
}
\newcommand{\manifold}[1]{\mathfrak{#1}}
% Define math macros
\renewcommand{\hat}{\widehat}
\renewcommand*{\vec}{\operatorname{vec}}
\newcommand*{\vech}{\operatorname{vech}}
\newcommand*{\rank}{\operatorname{rank}}
\newcommand*{\diag}{\operatorname{diag}}
\DeclareMathOperator{\tr}{tr}
\DeclareMathOperator{\var}{Var}
\DeclareMathOperator{\cov}{Cov}
\DeclareMathOperator{\Span}{span}
\DeclareMathOperator{\E}{\operatorname{\mathbb{E}}}
\DeclareMathOperator*{\argmax}{{arg\,max}}
\newcommand*{\D}{\textnormal{D}} % derivative
\renewcommand*{\H}{\textnormal{H}} % hessian
\renewcommand*{\d}{\textnormal{d}} % differential
\renewcommand*{\t}[1]{{#1^{T}}} % matrix transpose
\newcommand*{\pinv}[1]{{#1^{\dagger}}} % `Moore-Penrose pseudoinverse`
% rearrangement operator, generalization of the Van Loan and Pitsianis rearrangement operation
\newcommand*{\K}{\mathcal{K}}
\renewcommand{\checkmark}{{\color{Green}\ding{51}}}
\newcommand{\xmark}{{\color{Red!70}\ding{55}}}
% Special Matrix Sets (Manifolds)
\newcommand{\StiefelNonCompact}[2]{\mathbb{R}_{*}^{{#1}\times {#2}}}
\newcommand{\Stiefel}[2]{\mathrm{St}^{{#1}\times {#2}}}
\newcommand{\SymMat}[1]{\mathrm{Sym}^{{#1}\times {#1}}}
\newcommand{\SymPosDefMat}[1]{\mathrm{Sym}_{++}^{{#1}\times {#1}}}
\newcommand{\OrthogonalGrp}[1]{\mathrm{O}(#1)}
\newcommand{\SpecialOrthogonalGrp}[1]{\mathrm{SO}(#1)}
%%% Custom operators with ether one or two arguments (limits)
\makeatletter
%%% Multi-Linear Multiplication
% $\mlm_{k \in [r]}$ or $\mlm_{k = 1}^{r}$ (lower limit MUST be the first!)
% Save first argument as \arg@one
\def\mlm_#1{\def\arg@one{#1}\futurelet\next\mlm@i}
% Check for second argument
\def\mlm@i{\ifx\next^\expandafter\mlm@two\else\expandafter\mlm@one\fi}
% specialization for one or two arguments, both versions use saved first argument
\def\mlm@one{\mathchoice%
{\operatorname*{\scalerel*[1.65em]{\times}{\bigotimes}}_{\makebox[0pt][c]{$\scriptstyle \arg@one$}}}%
{\operatorname*{\scalerel*[1.65em]{\times}{\bigotimes}}_{\arg@one}}%
{\operatorname*{\scalerel*[1.65em]{\times}{\bigotimes}}_{\arg@one}}%
{\operatorname*{\scalerel*[1.65em]{\times}{\bigotimes}}_{\arg@one}}%
}
% this commands single argument is the second argument of \mlm, it gobbles the `^`
\def\mlm@two^#1{\mathchoice%
{\operatorname*{\scalerel*[1.65em]{\times}{\bigotimes}}_{\makebox[0pt][c]{$\scriptstyle \arg@one$}}^{\makebox[0pt][c]{$\scriptstyle #1$}}}%
{\operatorname*{\scalerel*[1.65em]{\times}{\bigotimes}}_{\arg@one}^{#1}}%
{\operatorname*{\scalerel*[1.65em]{\times}{\bigotimes}}_{\arg@one}^{#1}}%
{\operatorname*{\scalerel*[1.65em]{\times}{\bigotimes}}_{\arg@one}^{#1}}%
}
%%% Big Circle (Iterated Outer Product)
\def\bigouter_#1{\def\arg@one{#1}\futurelet\next\bigouter@i}
\def\bigouter@i{\ifx\next^\expandafter\bigouter@two\else\expandafter\bigouter@one\fi}
\def\bigouter@one{\mathchoice%
{\operatorname*{\scalerel*{\circ}{\bigotimes}}_{\makebox[0pt][c]{$\scriptstyle \arg@one$}}}%
{\operatorname*{\scalerel*{\circ}{\bigotimes}}_{\arg@one}}%
{\operatorname*{\scalerel*{\circ}{\bigotimes}}_{\arg@one}}%
{\operatorname*{\scalerel*{\circ}{\bigotimes}}_{\arg@one}}%
}
\def\bigouter@two^#1{\mathchoice%
{\operatorname*{\scalerel*{\circ}{\bigotimes}}_{\makebox[0pt][c]{$\scriptstyle \arg@one$}}^{\makebox[0pt][c]{$\scriptstyle #1$}}}%
{\operatorname*{\scalerel*{\circ}{\bigotimes}}_{\arg@one}^{#1}}%
{\operatorname*{\scalerel*{\circ}{\bigotimes}}_{\arg@one}^{#1}}%
{\operatorname*{\scalerel*{\circ}{\bigotimes}}_{\arg@one}^{#1}}%
}
%%% Big Kronecker Product (with overflowing limits)
% Save first argument as \arg@one
\def\bigkron_#1{\def\arg@one{#1}\futurelet\next\bigkron@i}
% Check for second argument
\def\bigkron@i{\ifx\next^\expandafter\bigkron@two\else\expandafter\bigkron@one\fi}
% specialization for one or two arguments, both versions use saved first argument
\def\bigkron@one{\mathchoice%
{\bigotimes_{\makebox[0pt][c]{$\scriptstyle \arg@one$}}}%
{\bigotimes_{\arg@one}}%
{\bigotimes_{\arg@one}}%
{\bigotimes_{\arg@one}}%
}
% this commands single argument is the second argument of \bigkron
\def\bigkron@two^#1{\mathchoice%
{\bigotimes_{\makebox[0pt][c]{$\scriptstyle \arg@one$}}^{\makebox[0pt][c]{$\scriptstyle #1$}}}%
{\bigotimes_{\arg@one}^{#1}}%
{\bigotimes_{\arg@one}^{#1}}%
{\bigotimes_{\arg@one}^{#1}}%
}
\makeatother
%%% "Fix" additional spacing around \left(...\right),
% see: https://tex.stackexchange.com/questions/2607/spacing-around-left-and-right
\let\originalleft\left
\let\originalright\right
\renewcommand{\left}{\mathopen{}\mathclose\bgroup\originalleft}
\renewcommand{\right}{\aftergroup\egroup\originalright}
\endlocaldefs
\begin{document}
\begin{frontmatter}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Enter the title of your article here %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\title{Generalized Multilinear Models for Sufficient Dimension Reduction on Tensor-valued Predictors}
\runtitle{GMLM for SDR on tensor-valued Predictors}
\begin{aug}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Only one address is permitted per author. %%
%% Only division, organization and e-mail is %%
%% included in the address. %%
%% Additional information can be included in %%
%% the Acknowledgments section if necessary. %%
%% ORCID can be inserted by command: %%
%% \orcid{0000-0000-0000-0000} %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\author[A]{\fnms{Daniel}~\snm{Kapla}\ead[label=e1]{daniel.kapla@tuwien.ac.at}}
\and
\author[A]{\fnms{Efstathia}~\snm{Bura}\ead[label=e2]{efstathia.bura@tuwien.ac.at}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Addresses %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\address[A]{Institute of Statistics and Mathematical Methods in Economics, Faculty of Mathematics and Geoinformation, TU Wien \\
\printead[presep={\ }]{e1,e2}}
\end{aug}
\begin{abstract}
We consider supervised learning problems with tensor-valued input. We derive multi-linear sufficient reductions for the regression or classification problem by modeling the conditional distribution of the predictors given the response as a member of the quadratic exponential family. We develop estimation procedures of sufficient reductions for both continuous and binary tensor-valued predictors. We prove the consistency and asymptotic normality of the estimated sufficient reduction using manifold theory. For multi-linear normal predictors, the estimation algorithm is highly computationally efficient and is also applicable to situations where the dimension of the predictors exceeds the sample size. Our method outperforms competing techniques in both simulated settings and real-world datasets involving continuous and binary tensor-valued predictors.
\end{abstract}
%%% see: https://mathscinet.ams.org/mathscinet/msc/msc2020.html
%% 62B05 - Sufficient statistics and fields
%% 62E20 - Asymptotic distribution theory in statistics
%% 62F12 - Asymptotic properties of parametric estimators
%% 62F30 - Parametric inference under constraints
%% 62H12 - Estimation in multivariate analysis
\begin{keyword}[class=MSC]
\kwd[Primary ]{62E20}
\kwd{62J05, 62F12}
\kwd[; secondary ]{62F30}
\kwd{62B05, 15A69}
\end{keyword}
\begin{keyword}
\kwd{Regression}
\kwd{Asymptotics}
\kwd{Manifold Theory}
\kwd{Constrained Optimization}
\kwd{Maximum Likelihood Estimation}
\end{keyword}
\end{frontmatter}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Main text entry area %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Introduction}\label{sec:introduction}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Complex data are collected at different times and/or under different conditions, often involving a large number of multi-indexed variables represented as tensor-valued data \cite[]{KoldaBader2009}. A tensor is a multi-dimensional array of numbers. As such, \textit{tensors} are a mathematical tool for representing data of complex structure, as they generalize matrices to higher dimensions. They occur in large-scale longitudinal studies \cite[e.g.][]{Hoff2015}, in agricultural experiments, chemometrics and spectroscopy \cite[e.g.][]{LeurgansRoss1992,Burdick1995}, in signal and video processing where sensors produce multi-indexed data, e.g. over spatial, frequency, and temporal dimensions \cite[e.g.][]{DeLathauwerCastaing2007,KofidisRegalia2001}, and in telecommunications \cite[e.g.][]{DeAlmeidaEtAl2007}. Other examples of multiway data include 3D images of the brain, where the modes are the 3 spatial dimensions, and spatio-temporal weather imaging data, a set of image sequences represented as 2 spatial modes and 1 temporal mode.
Tensor regression models have been proposed to leverage the structure inherent in tensor-valued data. For instance, \cite{HaoEtAl2021,ZhouLiZhu2013} focus on tensor covariates, while \cite{RabusseauKadri2016,LiZhang2017,ZhouLiZhu2013} focus on tensor responses, and \cite{Hoff2015,Lock2018} consider tensor on tensor regression. \cite{HaoEtAl2021} modeled a scalar response as a flexible nonparametric function of tensor covariates. \cite{ZhouLiZhu2013} assume the scalar response has a distribution in the exponential family given the tensor-valued predictors with the link modeled as a multi-linear function of the predictors. \cite{RabusseauKadri2016} model the tensor-valued response as a linear model with tensor-valued regression coefficients subject to a multi-linear rank constraint. \cite{LiZhang2017} approach the problem with a similar linear model but instead of a low-rank constraint the error term is assumed to have a separable Kronecker product structure while using a generalization of the envelope model \cite[]{CookLiChiaromonte2010}. \cite{ZhouEtAl2023} focus on partially observed tensor response given vector-valued predictors with mode-wise sparsity constraints in the regression coefficients. \cite{Hoff2015} extends an existing bilinear regression model to a tensor on tensor of conformable modes and dimensions regression model based on a Tucker product. \cite{Lock2018} uses a tensor contraction to build a penalized least squares model for a tensor with an arbitrary number of modes and dimensions.
We consider the general regression problem of fitting a response of general form (univariate, multivariate, tensor-valued) on a tensor-valued predictor. We operate in the context of \textit{sufficient dimension reduction} (SDR) \cite[e.g.][]{Cook1998,Li2018} based on inverse regression, which leads to regressing the tensor-valued predictor on tensor-valued functions of the response (tensor on tensor regression). Ours is a \textit{model-based} SDR method for tensor-valued data, where the conditional distribution of the inverse predictors ($\ten{X}$) belongs to a parametric family whose parameter depends on the response of the forward regression of $Y$ on $\ten{X}$. Specifically, we assume the conditional distribution of the tensor-valued predictors given the response belongs to the quadratic exponential family for which the first and second moments admit a separable Kronecker product structure. The quadratic exponential family contains the multi-linear normal and the multi-linear Ising distributions for continuous and binary tensor-valued random variables, respectively.
We obtain the maximum dimension reduction of the tensor-valued predictor without losing any information about the response. We derive maximum likelihood estimates of the sufficient reduction that enjoy the attendant optimality properties, such as consistency, efficiency and asymptotic normality.
The main challenge in estimation in multi-linear tensor regression models is the non-identifiability of the parameters, as has been acknowledged by researchers in this field (e.g., \cite{LiZhang2017, Lock2018}). In contrast to past approaches (e.g., \cite{ZhouLiZhu2013,Hoff2015, SunLi2017, LiZhang2017, LlosaMaitra2023}), ours does not require any penalty terms and/or sparsity constraints in order to address this issue. Instead, we model the parameter space as a smooth embedded manifold to ensure identifiability while enabling rich modeling flexibility.
Our main contributions are (i) formulating the dimension reduction problem via quadratic exponential family modeling with up to two-way interactions, which allows us to derive the minimal sufficient dimension reduction in closed form, (ii) gaining great modeling flexibility by defining the parameter space as a smooth embedded manifold, (iii) deriving the maximum likelihood estimator of the sufficient reduction subject to multi-linear constraints and overcoming parameter non-identifiability, (iv) establishing the consistency and asymptotic normality of the estimators, and (v) developing computationally efficient estimation algorithms (very fast in the case of multi-linear normal predictors).
We clarify our approach and highlight our contributions via a simple example where the predictor is matrix-valued (tensor of order $2$) and the response is univariate (binary). The electroencephalography (EEG) data set\footnote{\textsc{Begleiter, H.} (1999). EEG Database. \textit{Neurodynamics Laboratory, State University of New York Health Center}. Donated by Lester Ingber. \url{http://kdd.ics.uci.edu/databases/eeg/eeg.data.html}} of $77$ alcoholic and $45$ control subjects is commonly used in EEG signal analysis. EEG is a noninvasive neuroimaging technique that involves the placement of electrodes on the scalp to record electrical activity of the brain. Each subject's data point consists of a $p_1\times p_2 = 256\times 64$ matrix, with each row representing a time point and each column a channel. The measurements were obtained by exposing each individual to visual stimuli and measuring voltage values from $64$ electrodes placed on the subjects' scalps sampled at $256$ time points over $1$ second ($256$ Hz). For each subject, $120$ trials were measured under different stimulus conditions. To contrast our approach to previous methods applied to this data set (e.g., \cite{LiKimAltman2010,PfeifferForzaniBura2012,DingCook2015,PfeifferKaplaBura2021}), we use a single stimulus condition (S1), and for each subject, we average over all the trials under this condition. The data are $(\ten{X}_i, Y_i)$, $i = 1, \ldots, 122$, where $\ten{X}_i$ is a $256\times 64$ matrix (two-mode tensor), with each entry representing the mean voltage value of subject $i$ at a combination of a time point and a channel, averaged over all trials under the S1 stimulus condition, and $Y$ is a binary outcome variable with $Y_i = 1$ for an alcoholic and $Y_i = 0$ for a control subject.
We assume $\ten{X} \mid Y$ follows the bilinear model
\begin{equation}\label{eq:bilinear}
\ten{X} = \bar{\mat{\eta}} + \mat{\alpha}_1\mat{F}_Y\t{\mat{\alpha}_2} + \mat{\epsilon}
\end{equation}
where $\mat{F}_Y\equiv Y$ is a $1\times 1$ matrix (in general, $\mat{F}_Y$ is a tensor-valued function of $Y$), $\vec{\mat{\epsilon}}\sim\mathcal{N}(\mat{0}, \mat{\Sigma}_2\otimes\mat{\Sigma}_1)$, and $\otimes$ signifies the Kronecker product operator. Model \eqref{eq:bilinear} expresses that $\E(\ten{X}\mid Y) = \bar{\mat{\eta}} + \mat{\alpha}_1\mat{F}_Y\t{\mat{\alpha}_2}$ solely contains the information in $Y$ about $\ten{X}$. Alternatively, \eqref{eq:bilinear} can be written as $\ten{X} = \bar{\mat{\eta}} + \mat{\alpha}_1 \mat{F}_Y\t{\mat{\alpha}_2} + \mat{\Sigma}_1^{1/2}\mat{U}\mat{\Sigma}_2^{1/2}$, where $\vec{\mat{U}}\sim\mathcal{N}(\mat{0}, \mat{I}_{p_2}\otimes\mat{I}_{p_1})$. Thus, $\ten{X}$ follows a matrix normal distribution with mean $\bar{\mat{\eta}} + \mat{\alpha}_1 \mat{F}_Y\t{\mat{\alpha}_2}$ and variance-covariance structure $\mat{\Sigma}_2\otimes\mat{\Sigma}_1$. We derive the \textit{minimal sufficient reduction} of the predictor $\ten{X}$ for the regression (classification) of $Y$ on $\ten{X}$ to be
\begin{displaymath}
\ten{R}(\ten{X})
= \t{(\mat{\Sigma}_1^{-1}\mat{\alpha}_1)}(\ten{X}-\bar{\mat{\eta}})(\mat{\Sigma}_2^{-1}\mat{\alpha}_2)
= \t{\mat{\beta}_1}(\ten{X}-\bar{\mat{\eta}})\mat{\beta}_2.
\end{displaymath}
Our approach involves optimization of the likelihood of $\ten{X}$ conditional on $Y$ subject to the non-convex nonlinear constraint that both the mean and covariance parameter spaces have Kronecker product structure, where the components of the Kronecker product are non-identifiable. This is resolved by letting the parameter space be a smooth embedded submanifold. We derive the maximum likelihood estimator of the minimal sufficient reduction and use it to predict the binary response. Our method exhibits uniformly better classification accuracy when compared to competing existing techniques (see Table \ref{tab:eeg}). A key advantage of our approach is that it circumvents the dimensionality challenge ($p_1 \times p_2 = 256 \times 64 = 16384 \gg 122 = n$) without having to artificially reduce the dimension by preprocessing the data, as in \cite{LiKimAltman2010, PfeifferForzaniBura2012, PfeifferKaplaBura2021, DingCook2015}, or imposing simplifying assumptions and/or sparsity or regularization constraints \cite[]{PfeifferKaplaBura2021,LiZhang2017,SunLi2017, Lock2018}.
Even though our motivation is rooted in the SDR perspective, our inverse regression approach also applies to any regression model with a tensor-valued response and predictors of any type. Thus, it can be used as a stand-alone model for such data regardless of whether one is interested in deriving sufficient reductions or simply regressing tensor-valued variables on any type of predictor. Our tensor-to-tensor inverse regression model is a generalized multi-linear model that shares a similar structural form with the generalized linear models of \cite{ZhouLiZhu2013, Hoff2015} and \cite{SunLi2017}, but the resemblance does not extend further. In effect, our model can be viewed as an extension of reduced-rank regression to tensor-valued response regression.
The structure of this paper is as follows. We begin by introducing notation in \cref{sec:notation}, followed by a formal definition of the problem in \Cref{sec:problem-formulation}. The proposed model is specified in \cref{sec:gmlm-model}.
\Cref{sec:manifolds} provides a brief introduction to manifolds, serving as the basis for the consistency and asymptotic normality results detailed in \cref{sec:statprop}. A general maximum likelihood estimation procedure is presented and we derive specialized methods for the multi-linear normal and multi-linear Ising distributions in \cref{sec:ml-estimation}.
Simulations for continuous and binary tensor-valued predictors are carried out in \cref{sec:simulations}. We apply our model to EEG data, where the predictor takes the form of two- and three-dimensional arrays, as presented in \cref{sec:data-analysis}.
Finally, we summarize our contributions and highlight potential directions for future research in \cref{sec:discussion}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Notation}\label{sec:notation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Capital Latin letters in \LaTeX{} calligraphic font will denote tensors, and Latin and Greek letters in boldface will denote vectors and matrices throughout the paper. A multiway array $\ten{A}$ of dimension $q_1\times \ldots\times q_r$ is a tensor of order\footnote{Also referred to as rank, hence the variable name $r$. We refrain from using this term to avoid confusion with the rank of a matrix.} $r$, in short $r$-tensor, where $r\in\mathbb{N}$ is the number of its modes or axes (dimensions), and we denote it by $\ten{A}\in\mathbb{R}^{q_1\times \ldots\times q_r}$, where $\ten{A}_{i_1,\ldots,i_r} \in \mathbb{R}$ is its $(i_1, \ldots, i_r)$th entry. For example, a $p \times q$ matrix $\mat{B}$ is a tensor of \textit{order} 2 as it has two modes, the rows and columns.
The $k$-mode product $\ten{A}\times_k\mat{B}$ of a tensor $\ten{A}\in\mathbb{R}^{q_1\times q_2\times\cdots\times q_r}$ with a matrix $\mat{B}\in\mathbb{R}^{p\times q_k}$ is a generalization of matrix multiplication that operates along the $k$-th mode of the tensor, summing over the corresponding index $i_k$ to produce a new tensor with appropriately adjusted dimensions:
\begin{displaymath}
(\ten{A}\times_k\mat{B})_{i_1, \ldots, i_{k-1}, j, i_{k+1}, \ldots, i_r}
= \sum_{i_k = 1}^{q_k} \ten{A}_{i_1, \ldots, i_r}\mat{B}_{j, i_k}.
\end{displaymath}
The notation $\ten{A}\mlm_{k\in S}\mat{B}_k$ is shorthand for the iterative application of the mode product with matching matrices $\mat{B}_k$, for all indices $k\in S\subseteq\{1, \ldots, r\}$. For example $\ten{A}\mlm_{k\in\{2, 5\}}\mat{B}_k = \ten{A}\times_2\mat{B}_2\times_5\mat{B}_5$. By only allowing $S$ to be a set, this notation is unambiguous because the mode product commutes for different modes; i.e., $\ten{A}\times_j\mat{B}_j\times_k\mat{B}_k = \ten{A}\times_k\mat{B}_k\times_j\mat{B}_j$ for $j\neq k$. For matrices $\mat{A}, \mat{B}_1, \mat{B}_2$, the relation to classic matrix-matrix multiplication is
\begin{displaymath}
\mat{A}\times_1\mat{B}_1 = \mat{B}_1\mat{A}, \qquad
\mat{A}\times_2\mat{B}_2 = \mat{A}\t{\mat{B}_2}, \qquad
\mat{A}\mlm_{k = 1}^2\mat{B}_k = \mat{A}\mlm_{k \in \{1, 2\}}\mat{B}_k = \mat{B}_1\mat{A}\t{\mat{B}_2}.
\end{displaymath}
An application of the mode product over all indices is also known as the \emph{multi-linear multiplication}, or \emph{Tucker operator} \cite[]{Kolda2006,KofidisRegalia2001}.
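For illustration only, the following minimal numerical sketch (in NumPy; the helper name \texttt{mode\_prod} and the 0-based mode indexing are ours) computes the $k$-mode product and checks the matrix identities above.
\begin{verbatim}
import numpy as np

def mode_prod(A, B, k):
    """k-mode product A x_k B of a tensor A with a matrix B (0-based mode k)."""
    Ak = np.moveaxis(A, k, 0)                  # bring mode k to the front
    return np.moveaxis(np.tensordot(B, Ak, axes=1), 0, k)

A = np.random.rand(3, 4, 5)                    # a 3-tensor
B1, B2 = np.random.rand(2, 3), np.random.rand(6, 4)
C = mode_prod(mode_prod(A, B1, 0), B2, 1)      # A x_1 B1 x_2 B2, shape (2, 6, 5)

# the matrix case: A x_1 B1 = B1 A  and  A x_2 B2 = A B2^T
M = np.random.rand(3, 4)
assert np.allclose(mode_prod(M, B1, 0), B1 @ M)
assert np.allclose(mode_prod(M, B2, 1), M @ B2.T)
\end{verbatim}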
The \emph{vectorization} $\vec(\ten{A})$ of a tensor $\ten{A}$ is a vector consisting of the same elements as $\ten{A}$ arranged in lexicographic index order. We use the notation $\ten{A}\equiv \ten{B}$ if and only if $\vec(\ten{A}) = \vec(\ten{B})$. The \emph{flattening} (or \emph{unfolding} or \emph{matricization}) of $\ten{A}$ along mode $k$, denoted as $\ten{A}_{(k)}$, is defined as the matrix obtained by rearranging the elements of $\ten{A}$ such that the $k$-th mode is represented as the columns of the resulting matrix (see Appendix~A in the supplementary material for examples).
The \emph{inner product} between two tensors with the same number of elements is $\langle\ten{A}, \ten{B}\rangle = \t{(\vec\ten{A})}(\vec\ten{B})\in\mathbb{R}$ (allowing tensors of different order), and the \emph{Frobenius norm} for tensors is $\|\ten{A}\|_F = \sqrt{\langle\ten{A}, \ten{A}\rangle}$. The \emph{outer product} between two tensors $\ten{A}$ of dimensions $q_1, \ldots, q_r$ and $\ten{B}$ of dimensions $p_1, \ldots, p_l$ is a tensor $\ten{A}\circ\ten{B}$ of order $r + l$ and dimensions $q_1, \ldots, q_r, p_1, \ldots, p_l$, such that $\ten{A}\circ\ten{B} \equiv (\vec\ten{A})\t{(\vec{\ten{B}})}$. The iterated outer and iterated Kronecker products are written as
\begin{displaymath}
\bigouter_{k = 1}^{r} \mat{A}_k = \mat{A}_1\circ\ldots \circ\mat{A}_r,
\qquad
\bigkron_{k = 1}^{r} \mat{A}_k = \mat{A}_1\otimes\ldots \otimes\mat{A}_r
\end{displaymath}
where the order of iteration is important. Similar to the outer product, we extend the Kronecker product to $r$-tensors $\ten{A}, \ten{B}$ yielding a $2r$-tensor $\ten{A}\otimes\ten{B}$.
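As a numerical sanity check of these definitions (a NumPy sketch; the column-major flattening convention used for $\vec$ is our assumption):
\begin{verbatim}
import numpy as np

vec = lambda T: T.reshape(-1, order="F")       # vec: first index varies fastest
A, B = np.random.rand(2, 3, 4), np.random.rand(5, 6)

inner = vec(A) @ vec(np.ones((2, 3, 4)))       # inner product <A, 1>
fro = np.sqrt(vec(A) @ vec(A))                 # Frobenius norm of A

AoB = np.multiply.outer(A, B)                  # outer product A o B, a 5-tensor
assert np.allclose(AoB.reshape(vec(A).size, vec(B).size, order="F"),
                   np.outer(vec(A), vec(B)))   # A o B  ==  vec(A) vec(B)^T
\end{verbatim}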
Finally, the gradient of a function $\ten{F}(\ten{X})$ of any shape, univariate, multivariate, or tensor-valued, with argument $\ten{X}$ of any shape is defined to be the $p\times q$ matrix
\begin{displaymath}
\nabla_{\ten{X}}\ten{F} = \frac{\partial\t{(\vec\ten{F}(\ten{X}))}}{\partial(\vec\ten{X})},
\end{displaymath}
where $\vec{\ten{X}}\in\mathbb{R}^p$ and $\vec\ten{F}(\ten{X})\in\mathbb{R}^q$. This is consistent with the gradient of a real-valued function $f(\mat{x})$ where $\mat{x}\in\mathbb{R}^p$ as $\nabla_{\mat{x}}f\in\mathbb{R}^{p\times 1}$ \cite[][Ch.~15]{Harville1997}.
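For example, for $f(\mat{x}) = \t{\mat{a}}\mat{x}$ with $\mat{a}\in\mathbb{R}^p$ we have $\nabla_{\mat{x}}f = \mat{a}\in\mathbb{R}^{p\times 1}$, and for the vector-valued map $\ten{F}(\mat{x}) = \mat{A}\mat{x}$ with $\mat{A}\in\mathbb{R}^{q\times p}$ we have $\nabla_{\mat{x}}\ten{F} = \t{\mat{A}}\in\mathbb{R}^{p\times q}$.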
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Problem Formulation}\label{sec:problem-formulation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Our goal is to reduce the complexity of inferring the cumulative distribution function (cdf) $F$ of $Y\mid \ten{X}$, where $\ten{X}$ is assumed to admit $r$-tensor structure of dimension $p_1\times \ldots \times p_r$ with continuous or discrete entries, and the response $Y$ is unconstrained. To this end, we assume there exists a tensor-valued function of lower dimension $\ten{R}:\ten{X}\mapsto \ten{R}(\ten{X})$ such that
\begin{displaymath}
F(Y\mid \ten{X}) = F(Y\mid \ten{R}(\ten{X})),
\end{displaymath}
where the tensor-valued $\ten{R}(\ten{X})$ has dimension $q_1\times\ldots\times q_r$ with $q_j\leq p_j$, $j = 1, \ldots, r$, which represents a dimension reduction along every mode of $\ten{X}$. Since $\ten{R}(\ten{X})$ replaces the predictors without changing the conditional cdf of $Y\mid \ten{X}$, it is a \emph{sufficient reduction} for the regression $Y\mid\ten{X}$. This formulation is flexible as it allows, for example, selecting ``important'' modes by reducing ``unimportant'' modes to dimension $1$.
To find such a reduction $\ten{R}$, we leverage the equivalence relation pointed out in \cite{Cook2007},
\begin{equation}\label{eq:inverse-regression-sdr}
Y\mid\ten{X} \,{\buildrel d \over =}\, Y\mid \ten{R}(\ten{X})
\quad\Longleftrightarrow\quad
\ten{X}\mid(Y, \ten{R}(\ten{X})) \,{\buildrel d \over =}\, \ten{X}\mid\ten{R}(\ten{X}).
\end{equation}
According to \eqref{eq:inverse-regression-sdr}, a \textit{sufficient statistic} $\ten{R}(\ten{X})$ for $Y$ in the inverse regression $\ten{X}\mid Y$, where $Y$ is considered as a parameter indexing the model, is also a \textit{sufficient reduction} for $\ten{X}$ in the forward regression $Y\mid\ten{X}$.
The factorization theorem is the usual tool to identify sufficient statistics and requires a distributional model. We assume that $\ten{X}\mid Y$ is a full rank quadratic exponential family with density
\begin{align}
f(\ten{X}\mid Y = y)
&= h(\ten{X})\exp(\t{\mat{\eta}_y}\mat{t}(\ten{X}) - b(\mat{\eta}_y)) \notag \\
&= h(\ten{X})\exp(\langle \vec(\ten{X}), \mat{\eta}_{1y} \rangle + \langle \vech(\t{(\vec\ten{X})}(\vec\ten{X})), \mat{\eta}_{2y} \rangle - b(\mat{\eta}_{y})), \label{eq:quad-density}
\end{align}
where $\mat{t}(\ten{X}) = (\vec\ten{X}, \vech(\t{(\vec\ten{X})}(\vec\ten{X})))$ with $\vech{\mat{A}}$ denoting the \emph{half-vectorization} of a matrix $\mat{A}$. The dependence of $\ten{X}$ on $Y$ is fully captured in the natural parameter $\mat{\eta}_y$.\footnote{$\mat{\eta}_y$ is a function of the response $Y$, so it is not a parameter in the formal statistical sense. We view it as a parameter in order to leverage \eqref{eq:inverse-regression-sdr} and derive a sufficient reduction from the inverse regression.} The function $h$ is non-negative real-valued and $b$ is assumed to be at least twice continuously differentiable and strictly convex. An important feature of the \emph{quadratic exponential family} is that the distribution of its members is fully characterized by their first two moments. Distributions within the quadratic exponential family include the \emph{multi-linear normal} (\cref{sec:tensor-normal-estimation}) and \emph{multi-linear Ising model} (\cref{sec:ising_estimation}, a generalization of the (inverse) Ising model which is a multi-variate Bernoulli with up to second-order interactions) and mixtures of these two.
It is straightforward to generalize the formulation from the quadratic exponential family to incorporate higher moments. This, though, would make the number of parameters prohibitively large to estimate. This is why we opted for the quadratic exponential family in our formulation.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{The Generalized Multi-Linear Model}\label{sec:gmlm-model}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The dependence of $\ten{X}$ and $Y$ in model \eqref{eq:quad-density} is absorbed in $\mat{\eta}_y$, and $\mat{t}(\ten{X})$ is the minimal sufficient statistic for $\mat{\eta}_y = (\mat{\eta}_{1y}, \mat{\eta}_{2y})$. The first natural parameter component, $\mat{\eta}_{1y}$, captures the first order, and $\mat{\eta}_{2y}$, the second-order relationship of $Y$ and $\ten{X}$.
By the equivalence in \eqref{eq:inverse-regression-sdr}, in order to find the sufficient reduction $\ten{R}(\ten{X})$ we need to infer $\mat{\eta}_{1y}$, and $\mat{\eta}_{2y}$. This is reminiscent of generalized linear modeling, which we extend to a multi-linear formulation next, while deferring requirements on the parameterization to \cref{sec:manifolds}.
Suppose $\ten{F}_y$ is a known mapping of $y$ with zero expectation, $\E_Y\ten{F}_Y = 0$. We assume the dependence of $\ten{X}$ and $Y$ is reflected only in $\mat{\eta}_{1y}$ and let
\begin{equation}\label{eq:eta1-manifold}
\mat{\eta}_{1y} = \vec{\overline{\ten{\eta}}} + \mat{B}\vec\ten{F}_y,
\end{equation}
where $\overline{\ten{\eta}}\in\mathbb{R}^{p_1\times\ldots\times p_r}$ and $\mat{B}\in\mathbb{R}^{p\times q}$. The second parameter component $\mat{\eta}_{2y} = \mat{\eta}_{2}$ is assumed to be independent of $Y$ and related to a symmetric matrix $\mat{\Omega}\in\mathbb{R}^{p\times p}$ as
\begin{equation}\label{eq:eta2-manifold}
\mat{\eta}_{2} = c\t{\mat{D}_p}\vec\mat{\Omega},
\end{equation}
with $c\in\mathbb{R}$ a known non-zero constant determined by the distribution to ease modeling. The matrix $\mat{D}_p$ is the \emph{duplication matrix} \cite[][Ch.~11]{AbadirMagnus2005}, defined so that $\mat{D}_p\vech \mat{A} = \vec \mat{A}$ for every symmetric $p\times p$ matrix $\mat{A}$, which ensures that the density, expressed in the newly introduced parameters, has a convenient form. For example, for the multi-variate normal distribution, choosing $c = -1 / 2$ makes $\mat{\Omega}$ the inverse covariance matrix.
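To illustrate the duplication matrix, for $p = 2$ and a symmetric matrix $\mat{A}$ with $\vech\mat{A} = \t{(a_{11}, a_{21}, a_{22})}$,
\begin{displaymath}
\mat{D}_2\vech\mat{A} =
\begin{pmatrix}
1 & 0 & 0 \\
0 & 1 & 0 \\
0 & 1 & 0 \\
0 & 0 & 1
\end{pmatrix}
\begin{pmatrix} a_{11} \\ a_{21} \\ a_{22} \end{pmatrix}
= \begin{pmatrix} a_{11} \\ a_{21} \\ a_{21} \\ a_{22} \end{pmatrix}
= \vec\mat{A}.
\end{displaymath}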
To relate individual modes of $\ten{X}$ to the response through a multi-linear relation to $\ten{F}_y$, we assume $\ten{F}_y$ takes values in $\mathbb{R}^{q_1\times \ldots\times q_r}$; that is, $\ten{F}_y$ is tensor-valued. This, in turn, leads to imposing a corresponding Kronecker structure on the regression parameter $\mat{B}$, and \eqref{eq:eta1-manifold} becomes
\begin{equation}\label{eq:eta1}
\mat{\eta}_{1y} = \vec\biggl(\overline{\ten{\eta}} + \ten{F}_y\mlm_{j = 1}^{r}\mat{\beta}_j\biggr),
\end{equation}
with $\mat{B} = \bigotimes_{j = r}^{1}\mat{\beta}_j$ and component matrices $\mat{\beta}_j\in\mathbb{R}^{p_j\times q_j}$ for $j = 1, \ldots, r$.
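The equivalence of \eqref{eq:eta1-manifold} and \eqref{eq:eta1} follows from the well-known relation between the multi-linear multiplication and the Kronecker product,
\begin{displaymath}
\vec\Bigl(\ten{F}_y\mlm_{j = 1}^{r}\mat{\beta}_j\Bigr)
= \Bigl(\bigkron_{j = r}^{1}\mat{\beta}_j\Bigr)\vec\ten{F}_y
= \mat{B}\vec\ten{F}_y,
\end{displaymath}
which for $r = 2$ reduces to the familiar identity $\vec(\mat{\beta}_1\mat{F}_y\t{\mat{\beta}_2}) = (\mat{\beta}_2\otimes\mat{\beta}_1)\vec\mat{F}_y$.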
As the bilinear form of the matrix normal requires its covariance be separable, the multi-linear structure of $\ten{X}$ also induces separability on its covariance structure (see, e.g., \cite{Hoff2011}). Therefore, we further assume that
\begin{equation}\label{eq:eta2}
\mat{\eta}_{2} = c\t{\mat{D}_p}\vec\bigotimes_{j = r}^{1}\mat{\Omega}_j,
\end{equation}
where $\mat{\Omega}_j\in\mathbb{R}^{p_j\times p_j}$ are symmetric for $j = 1, \ldots, r$.
Writing the density in \eqref{eq:quad-density} in terms of the new parameters $\overline{\ten{\eta}}$, $\mat{\beta}_j$, $\mat{\Omega}_j$ for $j = 1, \ldots, r$, obtains
\begin{equation}\label{eq:gmlm-density}
f(\ten{X}\mid Y = y)
= h(\ten{X})\exp\biggl(
\biggl\langle \ten{X}, \overline{\ten{\eta}} + \ten{F}_y\mlm_{j = 1}^r\mat{\beta}_j \biggr\rangle + c\biggl\langle \ten{X}, \ten{X}\mlm_{j = 1}^r\mat{\Omega}_j \biggr\rangle - b(\mat{\eta}_y)
\biggr),
\end{equation}
where $\mat{\eta}_y = \mat{\eta}_y(\overline{\ten{\eta}}$, $\mat{\beta}_1$, $\ldots$, $\mat{\beta}_r$, $\mat{\Omega}_1$, $\ldots$, $\mat{\Omega}_r)$ is a well defined function.
The density of $\ten{X}$ given $Y$ in \eqref{eq:gmlm-density} is now indexed by these new parameters and sets the problem in the framework of generalized linear modeling based on a mode-wise linear relation between the predictors $\ten{X}$ and the response $Y$, which we call the \emph{Generalized Multi-Linear Model} (GMLM). Under the GMLM inverse regression model, a sufficient reduction for the forward regression of $Y$ on $\ten{X}$ is given in \cref{thm:sdr}.
\begin{theorem}[\hyperlink{proof:sdr}{SDR}]\label{thm:sdr}
A sufficient reduction for the regression $Y\mid \ten{X}$ under the quadratic exponential family inverse regression model \eqref{eq:gmlm-density} is
\begin{equation}\label{eq:sdr}
\ten{R}(\ten{X}) = (\ten{X} - \E\ten{X})\mlm_{j = 1}^{r}\t{\mat{\beta}_j}.
\end{equation}
The reduction \eqref{eq:sdr} is minimal if all $\mat{\beta}_j$ are full rank for $j=1,\ldots,r$.
\end{theorem}
The reduction \eqref{eq:sdr} in vectorized form is $\vec\ten{R}(\ten{X})=\t{\mat{B}}\vec(\ten{X} - \E\ten{X})$. \cref{thm:sdr} shows that the \emph{sufficient reduction} $\ten{R}(\ten{X})$ reduces $\ten{X}$ along each mode (dimension) linearly. The graph in \cref{fig:SDRvisual} is a visual representation of the sufficient reduction for a $3$-dimensional tensor-valued predictor. We provide the simplified version of the GMLM model and the sufficient reduction in \crefrange{ex:vector-valued}{ex:matrix-valued} for the special cases of vector- and matrix-valued predictors.
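For illustration only, a self-contained numerical sketch of \eqref{eq:sdr} (in NumPy, with our own variable names and 0-based modes; in practice the sample mean would replace $\E\ten{X}$):
\begin{verbatim}
import numpy as np

# R(X) = (X - E X) x_1 beta_1^T x_2 beta_2^T ... x_r beta_r^T
p, q = (4, 3, 5), (2, 2, 2)
betas = [np.random.rand(pj, qj) for pj, qj in zip(p, q)]
X, Xbar = np.random.rand(*p), np.zeros(p)     # Xbar stands in for E X

R = X - Xbar
for j, beta in enumerate(betas):
    # j-mode product with beta_j^T: reduce the j-th axis from p_j to q_j
    R = np.moveaxis(np.tensordot(beta.T, np.moveaxis(R, j, 0), axes=1), 0, j)
print(R.shape)                                 # (2, 2, 2)
\end{verbatim}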
\begin{figure}[!hpt]
\centering
\includegraphics[width=0.5\textwidth]{images/reduction.pdf}
\caption{\label{fig:SDRvisual}Visual depiction of the sufficient reduction in \cref{thm:sdr}.}
\end{figure}
\begin{example}[Vector-valued $\mat{X}$ ($r = 1$)]\label{ex:vector-valued}
For a vector-valued predictor $\mat{X}\in\mathbb{R}^{p_1}$, the density \eqref{eq:gmlm-density} reduces to
\begin{align*}
f(\mat{x}\mid Y = y) &= h(\mat{x})\exp(\langle\mat{x}, \overline{\mat{\eta}} + \mat{\beta}_1\mat{f}_y\rangle + c\langle\mat{x}, \mat{\Omega}_1\mat{x}\rangle - b(\mat{\eta}_y)) \\
&= h(\mat{x})\exp( \t{(\overline{\mat{\eta}} + \mat{\beta}_1\mat{f}_y)}\mat{x} + c\t{\mat{x}}\mat{\Omega}_1\mat{x} - b(\mat{\eta}_y) )
\end{align*}
where $\mat{f}_y\in\mathbb{R}^{q_1}$ is vector-valued as well. The sufficient reduction obtained by \cref{thm:sdr} is then $\mat{R}(\mat{x}) = \t{\mat{\beta}_1}(\mat{x} - \E\mat{X})\in\mathbb{R}^{q_1}$, with $\mat{B} = \mat{\beta}_1\in\mathbb{R}^{p_1\times q_1}$.
\end{example}
\begin{example}[Matrix-valued $\mat{X}$ ($r = 2$)]\label{ex:matrix-valued}
Suppose $\mat{X}$ is matrix-valued, which requires $\mat{F}_Y\in\mathbb{R}^{q_1\times q_2}$ to also be matrix-valued. Then the density \eqref{eq:gmlm-density} has the form
\begin{align*}
f(\mat{x}\mid Y = y)
&= h(\mat{x})\exp(\langle\mat{x}, \overline{\mat{\eta}} + \mat{\beta}_1\mat{F}_y\t{\mat{\beta}_2}\rangle + c\langle\mat{x}, \mat{\Omega}_1\mat{x}\mat{\Omega}_2\rangle - b(\mat{\eta}_y)) \\
&= h(\mat{x})\exp(\tr((\overline{\mat{\eta}} + \mat{\beta}_1\mat{F}_y\t{\mat{\beta}_2})\t{\mat{x}}) + c \tr(\mat{\Omega}_1\mat{x}\mat{\Omega}_2\t{\mat{x}}) - b(\mat{\eta}_y))
\end{align*}
where $\tr(\mat{A})$ is the trace of a square matrix $\mat{A}$. By \Cref{thm:sdr}, the sufficient reduction is $\mat{R}(\mat{X}) = \t{\mat{\beta}_1}(\mat{X} - \E\mat{X})\mat{\beta}_2\in\mathbb{R}^{q_1\times q_2}$, or in vector form, $\vec\mat{R}(\mat{X}) = \t{\mat{B}}\vec(\mat{X} - \E\mat{X})$ with $\mat{B} = \mat{\beta}_2\otimes\mat{\beta}_1\in\mathbb{R}^{p_1 p_2\times q_1 q_2}$.
\end{example}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Manifolds and Parameter Spaces}\label{sec:manifolds}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\cref{thm:sdr} finds the sufficient reduction for the regression of $Y$ on $\ten{X}$ in the population under the inverse GMLM \eqref{eq:gmlm-density}. In practice, we need to estimate the mode-wise reduction matrices $\mat{\beta}_j$ in the GMLM \eqref{eq:gmlm-density}. As we operate within the framework of the exponential family, we opt for maximum likelihood estimation (MLE). In a classic generalized linear model setting, this is straightforward and yields well-defined MLEs. In our setting, though, a major problem arises from the fact that our GMLM parameterization in \eqref{eq:gmlm-density} is \emph{not} identifiable. This is a direct consequence of the identity $\mat{\beta}_2\otimes\mat{\beta}_1 = (a\mat{\beta}_2)\otimes (a^{-1}\mat{\beta}_1)$ for any $a\neq 0$ (the same holds for the $\mat{\Omega}_j$'s), so that different parameterizations of \eqref{eq:gmlm-density} describe the same density. In other words, we do \emph{not} have a one-to-one relation between the parameters and the GMLM, and consistency cannot be established. Without consistency, derivation of the asymptotic distribution of the maximum likelihood estimator becomes infeasible.
To resolve this issue, we need to disambiguate the GMLM parameters to reestablish a one-to-one relation to the model. At the same time, we want to keep the mode-wise parameters $\mat{\beta}_j$ in \eqref{eq:gmlm-density} as those are the mode-wise reduction matrices needed by \cref{thm:sdr}. Using the mode-wise GMLM parameters has further advantages: the total number of parameters to be estimated is much smaller in many settings. Specifically, in the case of the multi-linear normal, a very efficient estimation algorithm is applicable. Moreover, the required number of observations for a reliable estimate is very small, potentially even smaller than any of the axis-dimensions $p_j$. We also gain significant estimation accuracy.
In the derivation of the GMLM we first introduced the parameters $\overline{\ten{\eta}}$ and $\mat{B}$, which model a linear relation between $\ten{X}$ and $Y$ through $\mat{\eta}_{1y}$ in \eqref{eq:eta1-manifold}, and the symmetric matrix $\mat{\Omega}$ to replace $\mat{\eta}_2$ in \eqref{eq:eta2-manifold}. Then, we modeled $\mat{B}$ using the mode-wise component matrices $\mat{\beta}_j$, which impose a non-linear constraint $\mat{B} = \bigotimes_{j = r}^1\mat{\beta}_j$. Similarly, the introduction of the $\mat{\Omega}_j$'s in $\mat{\Omega} = \bigotimes_{j = r}^1 \mat{\Omega}_j$ nonlinearly constrains $\mat{\Omega}$. Both the unconstrained $\mat{B}$ and $\vech(\mat{\Omega})$ are identifiable: the GMLM density \eqref{eq:gmlm-density} corresponds to one and only one pair $\mat{B}$ and $\vech(\mat{\Omega})$. Additionally, given any $\mat{\beta}_j$'s and $\mat{\Omega}_j$'s, $\mat{B}$ and $\vech(\mat{\Omega})$ are uniquely determined. Based on these observations, we derive the asymptotic behavior of the parameters $\mat{B}$ and $\vech{\mat{\Omega}}$ while operating with their components $\mat{\beta}_j$ and $\mat{\Omega}_j$. As a result we obtain a parameter space $\Theta$ with a non-linear constraint.
In addition to identifiable parameters, asymptotic normality (see \cref{thm:asymptotic-normality-gmlm} in \cref{sec:statprop}) requires differentiability. Therefore, the parameter space itself must admit a notion of differentiation, which usually means a vector space. This is too strong an assumption for our purposes. To weaken the vector space assumption, we consider \emph{smooth manifolds}, spaces that locally look like Euclidean space and allow the notion of differentiation. The more general \emph{topological} manifolds are too weak for differentiation, while a smooth manifold only allows for first derivatives. Without going into details, the solution is a \emph{Riemannian manifold} \cite[]{Lee2012,Lee2018,AbsilEtAl2007}. Like abstract \emph{smooth manifolds}, Riemannian manifolds are detached from our usual intuition and are complicated to handle. This is where an \emph{embedded (sub)manifold} comes to the rescue. Simply speaking, an embedded manifold is a manifold that is a subset of a manifold from which it inherits its properties. If a manifold is embedded in a Euclidean space, almost all the complications of abstract manifold theory simplify drastically. Moreover, since a Euclidean space is itself a Riemannian manifold, we inherit the means for higher derivatives. Finally, a smooth embedded submanifold structure for the parameter space maintains consistency with existing approaches and results for parameter sets with linear subspace structure. These reasons justify the constraint that the parameter space $\Theta$ be a \emph{smooth embedded manifold} in a Euclidean space.
We now define a \emph{smooth manifold} embedded in $\mathbb{R}^p$, avoiding unnecessary detours into the more general theory (see \cite{Kaltenbaeck2021}).
\begin{definition}[Embedded smooth manifold]\label{def:manifold}
A set $\manifold{A}\subseteq\mathbb{R}^p$ is an \emph{embedded smooth manifold} of dimension $d$ if for every $\mat{x}\in\manifold{A}$ there exists a smooth\footnote{Here \emph{smooth} means infinitely differentiable or $C^{\infty}$.} bi-continuous map $\varphi:U\cap\manifold{A}\to V$, called a \emph{chart}, with $\mat{x}\in U\subseteq\mathbb{R}^p$ open and $V\subseteq\mathbb{R}^d$ open.
\end{definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Kronecker Product Manifolds}\label{sec:kron-manifolds}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
As a basis to ensure that the constrained parameter space $\Theta$ is a manifold, without having to check every case separately, we provide two simple criteria for constructing manifolds that obey the Kronecker product constraint. We need the concept of a \emph{spherical} set, a set $\manifold{A}$ on which the Frobenius norm $\|\,.\,\|_F:\manifold{A}\to\mathbb{R}$ is constant. Also, we call a scale-invariant set $\manifold{A}$ a \emph{cone}; that is, $\manifold{A} = \{ c \mat{A} : \mat{A}\in\manifold{A} \}$ for all $c > 0$.
\begin{theorem}[\hyperlink{proof:kron-manifolds}{Kronecker Product Manifolds}]\label{thm:kron-manifolds}
Let $\manifold{A}\subseteq\mathbb{R}^{p_1\times q_1}\backslash\{\mat{0}\}, \manifold{B}\subseteq\mathbb{R}^{p_2\times q_2}\backslash\{\mat{0}\}$ be smooth embedded submanifolds. Assume one of the following conditions holds:
\begin{itemize}
\item[-] ``sphere condition'':
At least one of $\manifold{A}$ or $\manifold{B}$ is \emph{spherical} and let $d = \dim\manifold{A} + \dim\manifold{B}$.
\item[-] ``cone condition'':
Both $\manifold{A}$ and $\manifold{B}$ are \emph{cones} and let $d = \dim\manifold{A} + \dim\manifold{B} - 1$.
\end{itemize}
Then, $\{ \mat{A}\otimes \mat{B} : \mat{A}\in\manifold{A}, \mat{B}\in\manifold{B} \}\subset\mathbb{R}^{p_1 p_2\times q_1 q_2}$ is a smooth embedded $d$-manifold.
\end{theorem}
With \cref{thm:kron-manifolds} we can obtain sufficient conditions for the construction of a constrained parameter manifold.
\begin{theorem}[\hyperlink{proof:param-manifold}{Parameter Manifolds}]\label{thm:param-manifold}
Let
\begin{displaymath}
\manifold{K}_{\mat{B}} = \Bigl\{ \bigkron_{k = r}^{1}\mat{\beta}_k : \mat{\beta}_k\in\manifold{B}_k \Bigr\}
\quad\text{and}\quad
\manifold{K}_{\mat{\Omega}} = \Bigl\{ \bigkron_{k = r}^{1}\mat{\Omega}_k : \mat{\Omega}_k\in\manifold{O}_k \Bigr\}
\end{displaymath}
where $\manifold{B}_k\subseteq\mathbb{R}^{p_k\times q_k}\backslash\{\mat{0}\}$ and $\manifold{O}_k\subseteq\SymMat{p_k}\backslash\{\mat{0}\}$ are smooth embedded manifolds that are either spheres or cones, for $k = 1, ..., r$. Then, the \emph{constrained parameter space} \[ \Theta = \mathbb{R}^p \times \manifold{K}_{\mat{B}}\times\vech(\manifold{K}_{\mat{\Omega}})\subset\mathbb{R}^{p(p + 2 q + 3) / 2}\]
as well as $\manifold{K}_{\mat{B}}$, $\manifold{K}_{\mat{\Omega}}$ and $\vech{\manifold{K}_{\mat{\Omega}}}$ are smooth embedded manifolds, where $\vech(\manifold{K}_{\mat{\Omega}}) = \{ \vech{\mat{\Omega}} : \mat{\Omega}\in\manifold{K}_{\mat{\Omega}} \}$.
\end{theorem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Matrix Manifolds}\label{sec:matrix-manifolds}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
A powerful feature of \cref{thm:param-manifold} is the modeling flexibility it provides. For example, we can perform low-rank regression. Or, we may constrain two-way interactions between direct axis neighbors by using band matrices for the $\mat{\Omega}_k$'s, among others.
This flexibility derives from the many different matrix manifolds that can be used as building blocks $\manifold{B}_k$ and $\manifold{O}_k$ of the parameter space $\Theta$ in \cref{thm:param-manifold}. A list of possible choices, among others, is given in \cref{tab:matrix-manifolds}. As long as the parameters in $\Theta$ are a valid parameterization of a density (or probability mass function) of \eqref{eq:quad-density} subject to \eqref{eq:eta1-manifold} and \eqref{eq:eta2-manifold}, one may choose any of the manifolds listed in \cref{tab:matrix-manifolds} that are either cones or spherical. We also include an example that is neither spherical nor a cone. Such sets may still be valid building blocks, but they require more work as they do not directly lead to a parameter manifold via \cref{thm:param-manifold}. If the resulting parameter space $\Theta$ is an embedded manifold, the asymptotic theory of \cref{sec:statprop} is applicable.
\begin{table}
\caption{\label{tab:matrix-manifolds}Examples of embedded matrix manifolds. ``Symbol'' gives a (more or less) common notation for the matrix manifold, if one exists. ``C'' stands for \emph{cone}, meaning the set is scale invariant. ``S'' means \emph{spherical}, that is, of constant Frobenius norm.}
\begin{tabular}{l l c c c}
\hline
Symbol & Description & C & S & Dimension\\
\hline
$\mathbb{R}^{p\times q}$ & All matrices of dimension $p\times q$ &
\checkmark & \xmark & $p q$ \\
$\mathbb{R}_{*}^{p\times q}$ & Full rank $p\times q$ matrices &
\checkmark & \xmark & $p q$ \\
$\Stiefel{p}{q}$ & \emph{Stiefel Manifold}, $\{ \mat{U}\in\mathbb{R}^{p\times q} : \t{\mat{U}}\mat{U} = \mat{I}_q \}$ for $q\leq p$ &
\xmark & \checkmark & $p q - q (q + 1) / 2$ \\
$\mathcal{S}^{p - 1}$ & Unit sphere in $\mathbb{R}^p$, special case $\Stiefel{p}{1}$ &
\xmark & \checkmark & $p - 1$ \\
$\OrthogonalGrp{p}$ & Orthogonal Group, special case $\Stiefel{p}{p}$ &
\xmark & \checkmark & $p (p - 1) / 2$ \\
$\SpecialOrthogonalGrp{p}$ & Special Orthogonal Group $\{ \mat{U}\in \OrthogonalGrp{p} : \det{\mat{U}} = 1 \}$ &
\xmark & \checkmark & $p (p - 1) / 2$ \\
$\mathbb{R}_{r}^{p\times q}$ & Matrices of known rank $r > 0$, generalizes $\StiefelNonCompact{p}{q}$ &
\checkmark & \xmark & $r(p + q - r)$ \\
$\SymMat{p}$ & Symmetric matrices &
\checkmark & \xmark & $p (p + 1) / 2$ \\
$\SymPosDefMat{p}$ & Symmetric Positive Definite matrices &
\checkmark & \xmark & $p (p + 1) / 2$ \\
& Scaled Identity $\{ a\mat{I}_p : a\in\mathbb{R}_{+} \}$ &
\checkmark & \xmark & $1$ \\
& Symmetric $r$-band matrices (includes diagonal) &
\checkmark & \xmark & $(2 p - r) (r + 1) / 2$ \\
& Autocorrelation matrices $\{ \mat{A}\in\mathbb{R}^{p\times p} : \mat{A}_{i j} = \rho^{|i - j|}, \rho\in(0, 1) \}$ &
\xmark & \xmark & $1$ \\
\hline
\end{tabular}
\end{table}
\begin{remark}
The \emph{Grassmann Manifold} of $q$-dimensional subspaces in $\mathbb{R}^p$ is not listed in \cref{tab:matrix-manifolds} since it is not embedded in $\mathbb{R}^{p \times q}$.
\end{remark}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Maximum Likelihood Estimation}\label{sec:ml-estimation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Suppose $(\ten{X}_i, Y_i)$ are independently and identically distributed with joint cdf $F(\ten{X}, Y)$, for $i = 1, \ldots, n$. The empirical log-likelihood function of \eqref{eq:gmlm-density}, ignoring terms not depending on the parameters, is
\begin{equation}\label{eq:log-likelihood}
l_n(\mat{\theta}) = \frac{1}{n}\sum_{i = 1}^n \biggl(\Bigl\langle\ten{X}_i, \overline{\ten{\eta}} + \ten{F}_{y_i}\mlm_{k = 1}^{r}\mat{\beta}_k \Bigr\rangle + c\Bigl\langle\ten{X}_i, \ten{X}_i\mlm_{k = 1}^{r}\mat{\Omega}_k \Bigr\rangle - b(\mat{\eta}_{y_i})\biggr).
\end{equation}
The maximum likelihood estimate of $\mat{\theta}_0=(\vec\overline{\ten{\eta}}_0, \vec\mat{B}_0, \vech\mat{\Omega}_0)$ is the solution to the optimization problem
\begin{equation}\label{eq:mle}
\hat{\mat{\theta}}_n = \argmax_{\mat{\theta}\in\Theta}l_n(\mat{\theta})
\end{equation}
with $\hat{\mat{\theta}}_n = (\vec\widehat{\overline{\ten{\eta}}}, \vec\widehat{\mat{B}}, \vech\widehat{\mat{\Omega}})$ where $\widehat{\mat{B}} = \bigkron_{k = r}^{1}\widehat{\mat{\beta}}_k$ and $\widehat{\mat{\Omega}} = \bigkron_{k = r}^{1}\widehat{\mat{\Omega}}_k$.
In a classical \emph{generalized linear model} (GLM), the link function connecting the natural parameters to the expectation of the sufficient statistic $\mat{\eta}_y = \mat{g}(\E[\mat{t}(\ten{X}) \mid Y = y])$ is invertible. Such a link may not exist in our setting, but for our purpose it is sufficient to know the conditional first $\E_{\mat{\theta}}[\ten{X} \mid Y = y]$ and second $\E_{\mat{\theta}}[\ten{X}\circ\ten{X} \mid Y = y]$ moments, given a parameterization $\mat{\theta}$ of \eqref{eq:gmlm-density}.
Gradient descent is a powerful and widely used optimization algorithm to compute MLEs. We compute the gradients of $l_n$ in \cref{thm:grad}.
\begin{theorem}[\hyperlink{proof:grad}{Likelihood Gradient}]\label{thm:grad}
Suppose $(\ten{X}_i, y_i), i = 1, ..., n$, are i.i.d. with conditional log-likelihood of the form \eqref{eq:log-likelihood}, where $\mat{\theta}$ denotes the collection of all GMLM parameters $\overline{\ten{\eta}}$, ${\mat{B}} = \bigkron_{k = r}^{1}{\mat{\beta}}_k$ and ${\mat{\Omega}} = \bigkron_{k = r}^{1}{\mat{\Omega}}_k$ for $k = 1, ..., r$. Then, the partial gradients with respect to $\overline{\ten{\eta}}, \mat{\beta}_1, \ldots, \mat{\beta}_r, \mat{\Omega}_1, \ldots, \mat{\Omega}_r$ are given by
\begin{align*}
\nabla_{\overline{\ten{\eta}}}l_n &\equiv \frac{1}{n}\sum_{i = 1}^n (\ten{X}_i - \E_{\mat{\theta}}[\ten{X} \mid Y = y_i]), \\
\nabla_{\mat{\beta}_j}l_n &\equiv \frac{1}{n}\sum_{i = 1}^n (\ten{X}_i - \E_{\mat{\theta}}[\ten{X} \mid Y = y_i])_{(j)}\t{\Big(\ten{F}_{y_i}\mlm_{\substack{k = 1\\k\neq j}}^r\mat{\beta}_k\Big)_{(j)}}, \\
\nabla_{\mat{\Omega}_j}l_n &\equiv \frac{c}{n}\sum_{i = 1}^n (\ten{X}_i\otimes\ten{X}_i - \E_{\mat{\theta}}[\ten{X}\otimes\ten{X} \mid Y = y_i])\mlm_{\substack{k = 1\\k\neq j}}^r\t{(\vec{\mat{\Omega}_k})},
\end{align*}
so that $\nabla l_n = (\nabla_{\overline{\ten{\eta}}}l_n, \nabla_{\mat{\beta}_1}l_n, \ldots, \nabla_{\mat{\beta}_r}l_n, \nabla_{\mat{\Omega}_1}l_n, \ldots, \nabla_{\mat{\Omega}_r}l_n)$.
\end{theorem}
The partial gradients $\nabla_{\mat{\Omega}_j}l_n$ contain the \emph{Kronecker product} $\ten{X}\otimes\ten{X}$ of the tensor-valued predictor $\ten{X}$ with itself instead of the outer product $\ten{X}\circ\ten{X}$; the two differ only in the ordering of their elements and in their shape.
Although any GMLM can be fitted via gradient descent using \cref{thm:grad}, this may be computationally inefficient. In the special case of multi-linear normal predictors, an iterative cyclic updating scheme that converges fast, is stable, and does not require hyperparameters is derived in \cref{sec:tensor-normal-estimation}. On the other hand, the Ising model does not allow such a scheme, requiring a gradient-based method.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Multi-Linear Normal}\label{sec:tensor-normal-estimation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The \emph{multi-linear normal} is the extension of the matrix normal to tensor-valued random variables and a member of the quadratic exponential family \eqref{eq:quad-density} under \eqref{eq:eta2}. \cite{Dawid1981} and \cite{Arnold1981} introduced the term matrix normal and, in particular, \cite{Arnold1981} provided several theoretical results, such as its density, moments and conditional distributions of its components. The matrix normal distribution is a bilinear normal distribution, that is, a distribution of a two-way (two-component) array, each component representing a vector of observations \cite[]{OhlsonEtAl2013}. \cite{KolloVonRosen2005,Hoff2011,OhlsonEtAl2013} presented the extension of the bilinear to the multi-linear normal distribution, using a parallel extension of bilinear matrices to multi-linear tensors \cite[]{Comon2009}.
The defining feature of the matrix normal distribution, and its multi-linear extension, is the Kronecker product structure of its covariance. This formulation, where the vectorized variables are normal with multiway covariance structure modeled as a Kronecker product of matrices of much lower dimension, aims to overcome the significant modeling and computational challenges arising from the high computational complexity of manipulating tensor representations \cite[see, e.g.,][]{HillarLim2013,WangEtAl2022}.
Suppose the conditional distribution of the tensor $\ten{X}$ given $Y$ is multi-linear normal with mean $\ten{\mu}_y$ and covariance $\mat{\Sigma} = \bigkron_{k = r}^{1}\mat{\Sigma}_k$. We assume the distribution is non-degenerate, that is, the covariances $\mat{\Sigma}_k$ are symmetric positive definite matrices. Its density is
\begin{displaymath}
f_{\mat{\theta}}(\ten{X}\mid Y = y) = (2\pi)^{-p / 2}\prod_{k = 1}^{r}\det(\mat{\Sigma}_k)^{-p / (2 p_k)}\exp\biggl( -\frac{1}{2}\biggl\langle\ten{X} - \ten{\mu}_y, (\ten{X} - \ten{\mu}_y)\mlm_{k = 1}^{r}\mat{\Sigma}_k^{-1} \biggr\rangle \biggr).
\end{displaymath}
For the sake of simplicity and w.l.o.g., we assume $\ten{X}$ has 0 marginal expectation; i.e., $\E\ten{X} = 0$. Rewriting the multi-linear normal density in the form of the GMLM density \eqref{eq:gmlm-density} with the scaling constant $c = -1/2$ yields the parameter relations
\begin{equation}\label{eq:tnormal_cond_params}
\ten{\mu}_y = \ten{F}_y\mlm_{k = 1}^{r}\mat{\Omega}_k^{-1}\mat{\beta}_k, \qquad
\mat{\Omega}_k = \mat{\Sigma}_k^{-1}.
\end{equation}
Here we used that $\overline{\ten{\eta}} = 0$ due to $0 = \E\ten{X} = \E\E[\ten{X}\mid Y] = \E\ten{\mu}_Y$ in combination with $\E\ten{F}_Y = 0$. Additionally, the positive definiteness of the $\mat{\Sigma}_k$'s renders all the $\mat{\Omega}_k$'s symmetric positive definite. We obtain $\E_{\mat{\theta}}[\ten{X}\mid Y = y] = \ten{\mu}_y$, and
\begin{displaymath}
\E_{\mat{\theta}}[\ten{X}\circ\ten{X}\mid Y = y] \equiv \bigkron_{k = r}^1\mat{\Sigma}_k + (\vec{\ten{\mu}}_y)\t{(\vec{\ten{\mu}}_y)}.
\end{displaymath}
In practice, we assume we have a random sample of $n$ observations $(\ten{X}_i, \ten{F}_{y_i})$ from the joint distribution. We start the estimation process by demeaning the data. Then, only the reduction matrices $\mat{\beta}_k$ and the scatter matrices $\mat{\Omega}_k$ need to be estimated. To solve the optimization problem \eqref{eq:mle}, with $\overline{\ten{\eta}} = 0$, we initialize the parameters using a simple heuristic approach. First, we compute moment-based mode-wise marginal covariance estimates $\widehat{\mat{\Sigma}}_k(\ten{X})= \sum_{i = 1}^{n} (\ten{X}_i)_{(k)}\t{(\ten{X}_i)_{(k)}}/n$ and $\widehat{\mat{\Sigma}}_k(\ten{F}_Y)= \sum_{i = 1}^{n} (\ten{F}_{y_i})_{(k)}\t{(\ten{F}_{y_i})_{(k)}}/n$. Then, for every mode $k = 1, \ldots, r$, we compute the eigenvectors $\mat{v}_j(\widehat{\mat{\Sigma}}_k(\ten{X}))$, $\mat{v}_j(\widehat{\mat{\Sigma}}_k(\ten{F}_Y))$ that correspond to the leading eigenvalues $\lambda_j(\widehat{\mat{\Sigma}}_k(\ten{X}))$, $\lambda_j(\widehat{\mat{\Sigma}}_k(\ten{F}_Y))$ of the marginal covariance estimates, for $j = 1, \ldots, q_k$. We set
\begin{align*}
\mat{U}_k &= (\mat{v}_1(\widehat{\mat{\Sigma}}_k(\ten{X})), \ldots, \mat{v}_{q_k}(\widehat{\mat{\Sigma}}_{k}(\ten{X}))), \quad \mat{V}_k = (\mat{v}_1(\widehat{\mat{\Sigma}}_k(\ten{F}_Y)), \ldots, \mat{v}_{q_k}(\widehat{\mat{\Sigma}}_{k}(\ten{F}_Y))),\\
\mat{D}_k &= \diag(\lambda_1(\widehat{\mat{\Sigma}}_k(\ten{X}))\lambda_1(\widehat{\mat{\Sigma}}_k(\ten{F}_{Y})), \ldots, \lambda_{q_k}(\widehat{\mat{\Sigma}}_{k}(\ten{X}))\lambda_{q_k}(\widehat{\mat{\Sigma}}_k(\ten{F}_{Y}))).
\end{align*}
The initial value of $\mat{\beta}_k$ is $\hat{\mat{\beta}}_k^{(0)} = \mat{U}_k\sqrt{\mat{D}_k}\t{\mat{V}_k},$ and the initial value of $\mat{\Omega}_k$ is set to the identity $\mat{\Omega}_k^{(0)} = \mat{I}_{p_k}$, for $k=1,\ldots,r$.
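A sketch of this initialization, assuming the data are stored as \texttt{numpy} arrays of shape $(n, p_1, \ldots, p_r)$ and $(n, q_1, \ldots, q_r)$, is given below; the function and variable names are illustrative only and not taken from any released implementation.
\begin{verbatim}
import numpy as np

def unfold(T, k):
    # mode-k unfolding with rows indexed by mode k
    return np.moveaxis(T, k, 0).reshape(T.shape[k], -1)

def initial_values(X, F, q):
    n, r = X.shape[0], X.ndim - 1
    betas, Omegas = [], []
    for k in range(r):
        # moment-based mode-wise marginal covariance estimates
        Sig_X = sum(unfold(Xi, k) @ unfold(Xi, k).T for Xi in X) / n
        Sig_F = sum(unfold(Fi, k) @ unfold(Fi, k).T for Fi in F) / n
        lx, vx = np.linalg.eigh(Sig_X)     # eigenvalues in ascending order
        lf, vf = np.linalg.eigh(Sig_F)
        U = vx[:, ::-1][:, :q[k]]          # leading eigenvectors
        V = vf[:, ::-1][:, :q[k]]
        sqrtD = np.diag(np.sqrt(lx[::-1][:q[k]] * lf[::-1][:q[k]]))
        betas.append(U @ sqrtD @ V.T)      # beta_k^(0) = U_k sqrt(D_k) V_k'
        Omegas.append(np.eye(X.shape[k + 1]))
    return betas, Omegas
\end{verbatim}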
Given $\hat{\mat{\beta}}_1, \ldots, \hat{\mat{\beta}}_r, \hat{\mat{\Omega}}_1, \ldots, \hat{\mat{\Omega}}_r$, we take the gradient $\nabla_{\mat{\beta}_j}l_n$ of the multi-linear normal log-likelihood $l_n$ in \eqref{eq:log-likelihood}, applying \cref{thm:grad}, while keeping all parameters other than $\mat{\beta}_j$ fixed. Then, $\nabla_{\mat{\beta}_j}l_n = 0$ has the closed-form solution
\begin{equation}\label{eq:tensor_normal_beta_solution}
\t{\mat{\beta}_j} = \biggl(
\sum_{i = 1}^{n}
\Bigl( \ten{F}_{y_i}\mlm_{k \neq j}\hat{\mat{\Omega}}_k^{-1}\hat{\mat{\beta}}_k \Bigr)_{(j)}
\t{\Bigl( \ten{F}_{y_i}\mlm_{k \neq j}\hat{\mat{\beta}}_k \Bigr)_{(j)}}
\biggr)^{-1}
\biggl(
\sum_{i = 1}^{n}
\Bigl( \ten{F}_{y_i}\mlm_{k \neq j}\hat{\mat{\beta}}_k \Bigr)_{(j)}
\t{(\ten{X}_{i})_{(j)}}
\biggr)
\hat{\mat{\Omega}}_j.
\end{equation}
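The update \eqref{eq:tensor_normal_beta_solution} translates directly into code. The following Python sketch assumes the data layout of the previous sketch; the helper names are again ours, and the mode-$j$ unfolding only needs a column ordering that is applied consistently to all tensors.
\begin{verbatim}
import numpy as np

def unfold(T, k):
    return np.moveaxis(T, k, 0).reshape(T.shape[k], -1)

def mode_k_multiply(T, M, k):
    return np.moveaxis(np.tensordot(M, T, axes=(1, k)), 0, k)

def update_beta(j, X, F, betas, Omegas):
    # closed-form solution of grad_{beta_j} l_n = 0, other parameters fixed
    r, lhs, rhs = X.ndim - 1, 0.0, 0.0
    for Xi, Fi in zip(X, F):
        A, B = Fi, Fi
        for k in range(r):
            if k != j:
                A = mode_k_multiply(A, np.linalg.solve(Omegas[k], betas[k]), k)
                B = mode_k_multiply(B, betas[k], k)
        lhs += unfold(A, j) @ unfold(B, j).T    # q_j x q_j
        rhs += unfold(B, j) @ unfold(Xi, j).T   # q_j x p_j
    return np.linalg.solve(lhs, rhs @ Omegas[j]).T   # beta_j, p_j x q_j
\end{verbatim}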
Equating the partial gradient of the $j$th scatter matrix $\mat{\Omega}_j$ in \cref{thm:grad} to zero ($\nabla_{\mat{\Omega}_j}l_n = 0$) gives a quadratic matrix equation due to the dependence of $\ten{\mu}_y$ on $\mat{\Omega}_j$. In practice though, it is faster, more stable, and equally accurate to use mode-wise covariance estimates via the residuals
\begin{displaymath}
\hat{\ten{R}}_i = \ten{X}_i - \hat{\ten{\mu}}_{y_i} = \ten{X}_i - \ten{F}_{y_i}\mlm_{k = 1}^{r}\hat{\mat{\Omega}}_k^{-1}\hat{\mat{\beta}}_k.
\end{displaymath}
The estimates are computed via $\tilde{\mat{\Sigma}}_j = \sum_{i = 1}^n (\hat{\ten{R}}_i)_{(j)} \t{(\hat{\ten{R}}_i)_{(j)}},$ where $\tilde{s}\tilde{\mat{\Sigma}}_j = \hat{\mat{\Omega}}_j^{-1}$. To determine the scaling factor $\tilde{s}$, we require the mean squared error to equal the trace of the covariance estimate,
\begin{displaymath}
\frac{1}{n}\sum_{i = 1}^n \langle \hat{\ten{R}}_i, \hat{\ten{R}}_i \rangle = \tr\bigkron_{k = r}^{1}\hat{\mat{\Omega}}_k^{-1} = \prod_{k = 1}^{r}\tr{\hat{\mat{\Omega}}_k^{-1}} = \tilde{s}^r\prod_{k = 1}^{r}\tr{\tilde{\mat{\Sigma}}_k},
\end{displaymath}
so that
\begin{displaymath}
\tilde{s} = \biggl(\Bigl(\prod_{k = 1}^{r}\tr{\tilde{\mat{\Sigma}}_k}\Bigr)^{-1}\frac{1}{n}\sum_{i = 1}^n \langle \hat{\ten{R}}_i, \hat{\ten{R}}_i \rangle\biggr)^{1 / r}
\end{displaymath}
resulting in the estimates $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j)^{-1}$.
Estimation is performed by updating the estimates $\hat{\mat{\beta}}_j$ via \eqref{eq:tensor_normal_beta_solution} for $j = 1, \ldots, r$, and then recomputing the $\hat{\mat{\Omega}}_j$ estimates simultaneously while keeping the $\hat{\mat{\beta}}_j$'s fixed. This procedure is repeated until convergence.
A technical detail for numerical stability is to ensure that the scaled values $\tilde{s}\tilde{\mat{\Sigma}}_j$, assumed to be symmetric and positive definite, are well-conditioned. Thus, we estimate the condition number of $\tilde{s}\tilde{\mat{\Sigma}}_j$ before computing the inverse. In the case of ill-conditioning, we use the regularized $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j + 0.2 \lambda_{1}(\tilde{s}\tilde{\mat{\Sigma}}_j)\mat{I}_{p_j})^{-1}$ instead, where $\lambda_{1}(\tilde{s}\tilde{\mat{\Sigma}}_j)$ is the largest eigenvalue. Experiments showed that this regularization is usually only required in the first few iterations.
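A sketch of the residual-based scatter update, including the scaling factor $\tilde{s}$ and the regularization safeguard, reads as follows; the condition-number threshold \texttt{cond\_max} below is an arbitrary choice of ours and not prescribed by the algorithm.
\begin{verbatim}
import numpy as np

def unfold(T, k):
    return np.moveaxis(T, k, 0).reshape(T.shape[k], -1)

def update_Omegas(R, cond_max=1e6):
    # R holds the residuals R_i, stored with shape (n, p_1, ..., p_r)
    n, r = R.shape[0], R.ndim - 1
    Sig = [sum(unfold(Ri, k) @ unfold(Ri, k).T for Ri in R) for k in range(r)]
    mse = np.mean([np.sum(Ri**2) for Ri in R])        # (1/n) sum <R_i, R_i>
    s = (mse / np.prod([np.trace(S) for S in Sig]))**(1.0 / r)
    Omegas = []
    for S in Sig:
        sS = s * S
        if np.linalg.cond(sS) > cond_max:             # ill-conditioned
            sS = sS + 0.2 * np.linalg.eigvalsh(sS)[-1] * np.eye(len(sS))
        Omegas.append(np.linalg.inv(sS))              # Omega_j = (s Sigma_j)^{-1}
    return Omegas
\end{verbatim}
Alternating these two updates until the log-likelihood in \eqref{eq:log-likelihood} stabilizes mirrors the ``flip-flop'' iteration described above.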
If the parameter space follows a more general setting as in \cref{thm:param-manifold}, updating may produce estimates outside the parameter space. A simple and efficient method is to project every updated estimate onto the corresponding manifold.
A standard algorithm to calculate the MLE of a Kronecker product is block-coordinate descent, proposed independently by \cite{MardiaGoodall1993} and \cite{Dutilleul1999}. It was later called ``flip-flop'' algorithm by \cite{LuZimmerman2005} for the computation of the maximum likelihood estimators of the components of a separable covariance matrix. \cite{ManceurDutilleul2013} extended the ``flip-flop'' algorithm for the computation of the MLE of the separable covariance structure of a 3-way and 4-way normal distribution and obtained a lower bound for the sample size required for its existence. The same issue was also studied by \cite{DrtonEtAl2020} in the case of a two-way array (matrix). Our algorithm uses a similar ``flip-flop'' approach by iteratively updating the $\mat{\beta}_k$'s and $\mat{\Omega}_k$'s.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Multi-Linear Ising Model}\label{sec:ising_estimation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The Ising\footnote{Also known as the \emph{Lenz-Ising} model, as the physical assumptions of the model were developed by both Lenz and Ising \cite[]{Niss2005}; Ising gave a closed-form solution for the 1D lattice \cite[]{Ising1925}.} model \cite[]{Lenz1920,Ising1925,Niss2005} is a mathematical model originating in statistical physics to study ferromagnetism in a thermodynamic setting. It describes magnetic dipoles (atomic ``spins'' with values $\pm 1$) under an external magnetic field (first moments) while allowing two-way interactions (second moments) between direct neighbors on a lattice, a discrete grid. The Ising model is a member of the discrete quadratic exponential family \cite[]{CoxWermuth1994,JohnsonEtAl1997} for multivariate binary outcomes where the interaction structure (non-zero correlations) is determined by the lattice. The $p$-dimensional Ising model is a discrete probability distribution on the set of $p$-dimensional binary vectors $\mat{x}\in\{0, 1\}^p$ with probability mass function (pmf) given by
\begin{displaymath}
P_{\mat{\gamma}}(\mat{x}) = p_0(\mat{\gamma})\exp(\t{\vech(\mat{x}\t{\mat{x}})}\mat{\gamma}).
\end{displaymath}
The scaling factor $p_0(\mat{\gamma})\in\mathbb{R}_{+}$ ensures that $P_{\mat{\gamma}}$ is a pmf. It is equal to the probability of the zero event: $P(X = \mat{0}) = P_{\mat{\gamma}}(\mat{0}) = p_0(\mat{\gamma})$. The reciprocal of $p_0$, more commonly known as the \emph{partition function}, is given by
\begin{equation}\label{eq:ising-partition-function}
p_0(\mat{\gamma})^{-1} = \sum_{\mat{x}\in\{0, 1\}^p}\exp(\t{\vech(\mat{x}\t{\mat{x}})}\mat{\gamma}).
\end{equation}
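Since the sum in \eqref{eq:ising-partition-function} ranges over all $2^p$ binary vectors, it can be evaluated by brute force only for small $p$. The following Python sketch (illustrative code of ours, with $\vech$ stacking the lower triangle column by column) does exactly that.
\begin{verbatim}
import numpy as np
from itertools import product

def vech(A):
    # half-vectorization: stack the lower triangle of A column by column
    return np.concatenate([A[j:, j] for j in range(A.shape[0])])

def partition_function(gamma, p):
    # 1 / p_0(gamma), brute force over all 2^p binary vectors (small p only)
    return sum(np.exp(vech(np.outer(x, x)) @ gamma)
               for x in (np.array(b, dtype=float)
                         for b in product([0, 1], repeat=p)))

p = 3
gamma = np.zeros(p * (p + 1) // 2)
print(1.0 / partition_function(gamma, p))   # p_0 = 2**(-p) = 0.125 here
\end{verbatim}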
Abusing notation, let $\mat{\gamma}_{j l}$ denote the element of $\mat{\gamma}$ corresponding to $\mat{x}_j\mat{x}_l$ in $\vech(\mat{x}\t{\mat{x}})$.\footnote{Specifically, the element $\mat{\gamma}_{j l}$ of $\mat{\gamma}$ is a short hand for $\mat{\gamma}_{\iota(j, l)}$ with $\iota(j, l) = (\min(j, l) - 1)(2 p - \min(j, l)) / 2 + \max(j, l)$ mapping the matrix row index $j$ and column index $l$ to the corresponding half vectorization indices $\iota(j, l)$.} The ``diagonal'' parameter $\mat{\gamma}_{j j}$ expresses the conditional log odds of $X_j = 1\mid X_{-j} = \mat{0}$, where the negative subscript in $X_{-j}$ describes the $p - 1$ dimensional vector $X$ with the $j$th element removed. The off diagonal entries $\mat{\gamma}_{j l}$, $j\neq l$, are equal to the conditional log odds of simultaneous occurrence $X_j = 1, X_l = 1 \mid X_{-j, -l} = \mat{0}$. More precisely, the conditional probabilities $\pi_j(\mat{\gamma}) = P_{\mat{\gamma}}(X_j = 1\mid X_{-j} = \mat{0})$ and $\pi_{j, l}(\mat{\gamma}) = P_{\mat{\gamma}}(X_j = 1, X_l = 1\mid X_{-j, -l} = \mat{0})$ are related to the natural parameters via
\begin{equation}\label{eq:ising-two-way-log-odds}
\mat{\gamma}_{j j} = \log\frac{\pi_j(\mat{\gamma})}{1 - \pi_j(\mat{\gamma})}, \qquad
\mat{\gamma}_{j l} = \log\frac{1 - \pi_j(\mat{\gamma})\pi_l(\mat{\gamma})}{\pi_j(\mat{\gamma})\pi_l(\mat{\gamma})}\frac{\pi_{j l}(\mat{\gamma})}{1 - \pi_{j l}(\mat{\gamma})}.
\end{equation}
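The index mapping $\iota$ from the footnote above can be checked numerically; for instance, for $p = 3$ it enumerates the lower triangle column by column. The small sketch below is illustrative only.
\begin{verbatim}
def iota(j, l, p):
    # 1-based position of gamma_{jl} inside vech(x x^T)
    lo, hi = min(j, l), max(j, l)
    return (lo - 1) * (2 * p - lo) // 2 + hi

# lower triangle of a 3 x 3 matrix, column by column:
# (1,1), (2,1), (3,1), (2,2), (3,2), (3,3)  ->  1, ..., 6
pairs = [(1, 1), (2, 1), (3, 1), (2, 2), (3, 2), (3, 3)]
print([iota(j, l, 3) for j, l in pairs])    # [1, 2, 3, 4, 5, 6]
\end{verbatim}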
Conditional Ising models, incorporating the information of covariates $Y$ into the model, were considered by \cite{ChengEtAl2014,BuraEtAl2022}. The direct way is to parameterize $\mat{\gamma} = \mat{\gamma}_y$ by the covariate $Y = y$ to model a conditional distribution $P_{\mat{\gamma}_y}(\mat{x}\mid Y = y)$.
We extend the conditional pmf by allowing the binary variables to be tensor-valued; that is, we set $\mat{x} = \vec{\ten{X}}$, with dimension $p = \prod_{k = 1}^{r}p_k$ for $\ten{X}\in\{ 0, 1 \}^{p_1\times\cdots\times p_r}$. The tensor structure of $\ten{X}$ is accommodated by assuming Kronecker product constraints on the parameter vector $\mat{\gamma}_y$, in a fashion similar to the multi-linear normal model. This means that we compare the pmf $P_{\mat{\gamma}_y}(\vec{\ten{X}} | Y = y)$ with the quadratic exponential family \eqref{eq:quad-density} with the natural parameters modeled by \eqref{eq:eta1} and \eqref{eq:eta2}. The diagonal of $(\vec{\ten{X}})\t{(\vec{\ten{X}})}$ is equal to $\vec{\ten{X}}$, which results in the GMLM being expressed as
\begin{align}
P_{\mat{\gamma}_y}(\ten{X} \mid Y = y)
&= p_0(\mat{\gamma}_y)\exp(\t{\vech((\vec{\ten{X}})\t{(\vec{\ten{X}})})}\mat{\gamma}_y) \label{eq:ising-cond-prob} \\
&= p_0(\mat{\gamma}_y)\exp\Bigl(\Bigl\langle \ten{X}, \ten{F}_y\mlm_{k = 1}^{r}\mat{\beta}_k \Bigr\rangle + \Bigl\langle\ten{X}\mlm_{k = 1}^{r}\mat{\Omega}_k, \ten{X}\Bigr\rangle\Bigr) \nonumber
\end{align}
where we set $\overline{\ten{\eta}} = 0$. This imposes an additional constraint on the model, as the diagonal elements of $\mat{\Omega} = \bigkron_{k = r}^{1}\mat{\Omega}_k$ take the role of $\overline{\ten{\eta}}$, although not fully. Based on this modeling choice, the relation between the natural parameters $\mat{\gamma}_y$ of the conditional Ising model and the GMLM parameters $\mat{\beta}_k$ and $\mat{\Omega}_k$ is
\begin{equation}\label{eq:ising-natural-params}
\mat{\gamma}_y
= \t{\mat{D}_p}\vec(\mat{\Omega} + \diag(\mat{B}\vec{\ten{F}_y}))
= \t{\mat{D}_p}\vec\Biggl(\bigkron_{k = r}^{1}\mat{\Omega}_k + \diag\biggl(\vec\Bigl(\ten{F}_y\mlm_{k = 1}^{r}\mat{\beta}_k\Bigr)\biggr)\Biggr).
\end{equation}
In contrast to the multi-linear normal GMLM, the matrices $\mat{\Omega}_k$ are only required to be symmetric. More specifically, we require $\mat{\Omega}_k$, for $k = 1, \ldots, r$, to be elements of an embedded submanifold of $\SymMat{p_k}$ (see \cref{sec:kron-manifolds,sec:matrix-manifolds}). The mode-wise reduction matrices $\mat{\beta}_k$ are elements of an embedded submanifold of $\mathbb{R}^{p_k\times q_k}$. Common choices are listed in \cref{sec:matrix-manifolds}.
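For completeness, the relation \eqref{eq:ising-natural-params} can be sketched in Python as follows, where the illustrative helper \texttt{Dp\_t\_vec} applies $\t{\mat{D}_p}$ to the vectorization of a square matrix (keeping diagonal entries and summing the two symmetric off-diagonal entries), and vectorization is column-major to match the Kronecker ordering; all names and the data layout are ours, not part of any released implementation.
\begin{verbatim}
import numpy as np
from functools import reduce

def mode_k_multiply(T, M, k):
    return np.moveaxis(np.tensordot(M, T, axes=(1, k)), 0, k)

def Dp_t_vec(A):
    # (D_p' vec A)_{iota(j,l)} = A_jj on the diagonal, A_jl + A_lj otherwise
    out = []
    for j in range(A.shape[0]):
        out.append(A[j, j])
        out.extend(A[j + 1:, j] + A[j, j + 1:])
    return np.array(out)

def ising_gamma_y(F_y, betas, Omegas):
    Omega = reduce(np.kron, reversed(Omegas))      # kron_{k=r}^{1} Omega_k
    mean = F_y
    for k, beta in enumerate(betas):
        mean = mode_k_multiply(mean, beta, k)      # F_y x_k beta_k
    A = Omega + np.diag(mean.reshape(-1, order="F"))
    return Dp_t_vec(A)
\end{verbatim}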
To solve the optimization problem \eqref{eq:mle}, given a data set $(\ten{X}_i, y_i)$, $i = 1, \ldots, n$, we use the gradients in \cref{thm:grad} and a variation of gradient descent as described in Appendix~E in the supplementary material, which is mostly technical in nature.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Statistical Properties}\label{sec:statprop}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Let $Z$ be a random variable with density $f_{\mat{\theta_0}}\in\{ f_{\mat{\theta}}: \mat{\theta}\in\Theta \}$, where $\Theta$ is a subset of a Euclidean space. We want to estimate the parameter that indexes the pdf, ${\mat{\theta}}_0$, using $n$ i.i.d. (independent and identically distributed) copies of $Z$. We assume that for every $\mat{\theta}\in\Theta$ there exists a known, real-valued and measurable function $z\mapsto m_{\mat{\theta}}(z)$, and that ${\mat{\theta}}_0$ is the unique maximizer of the map $\mat{\theta}\mapsto M(\mat{\theta}) = \E m_{\mat{\theta}}(Z)$. For the estimation, we maximize the empirical version
\begin{align}\label{eq:Mn}
M_n(\mat{\theta}) &= \frac{1}{n}\sum_{i = 1}^n m_{\mat{\theta}}(Z_i).
\end{align}
An \emph{M-estimator} $\hat{\mat{\theta}}_n = \hat{\mat{\theta}}_n(Z_1, ..., Z_n)$ is a maximizer for the objective function $M_n$ over the parameter space $\Theta$ defined as
\begin{displaymath}
\hat{\mat{\theta}}_n = \argmax_{\mat{\theta}\in\Theta} M_n(\mat{\theta}).
\end{displaymath}
If the objective function $M_n$ is the log-likelihood, $\widehat{\mat{\theta}}_n$ is the MLE. In general, the following asymptotic theory is valid for more than just the MLE. Moreover, it is not even necessary to have a \textit{perfect} maximizer; as long as the objective has a finite supremum, it is sufficient to take an \emph{almost maximizer} $\hat{\mat{\theta}}_n$ as defined in the following.
\begin{definition}[weak and strong M-estimators]
An estimator $\hat{\mat{\theta}}_n$ for the objective function $M_n$ in \eqref{eq:Mn} with $\sup_{\mat{\theta}\in\Theta}M_n(\mat{\theta}) < \infty$ such that
\begin{displaymath}
M_n(\hat{\mat{\theta}}_n) \geq \sup_{\mat{\theta}\in\Theta}M_n(\mat{\theta}) - o_P(n^{-1})
\end{displaymath}
is called a \emph{strong M-estimator} over $\Theta$. Replacing $o_P(n^{-1})$ by $o_P(1)$ gives a \emph{weak M-estimator}.
\end{definition}
\begin{theorem}[\hyperlink{proof:asymptotic-normality-gmlm}{Asymptotic Normality}]\label{thm:asymptotic-normality-gmlm}
Assume $Z = (\ten{X}, Y)$ satisfies model \eqref{eq:quad-density} subject to \eqref{eq:eta1-manifold} and \eqref{eq:eta2-manifold} with true constrained parameter $\mat{\theta}_0 = (\overline{\eta}_0, \mat{B}_0, \vech{\mat{\Omega}_0})\in\Theta$, where $\Theta$ is defined in \cref{thm:param-manifold}. Under the regularity Conditions 1--3 in Appendix~B in the supplementary material, there exists a strong M-estimator sequence $\hat{\mat{\theta}}_n$ deriving from $l_n$ in \eqref{eq:log-likelihood} over $\Theta$. Furthermore, any strong M-estimator $\hat{\mat{\theta}}_n$ converges in probability to the true parameter $\mat{\theta}_0$, $\hat{\mat{\theta}}_n\xrightarrow{p}\mat{\theta}_0$, over $\Theta$. Moreover, every strong M-estimator $\hat{\mat{\theta}}_n$ is asymptotically normal,
\begin{displaymath}
\sqrt{n}(\hat{\mat{\theta}}_n - \mat{\theta}_0) \xrightarrow{d} \mathcal{N}(0, \mat{\Sigma}_{\mat{\theta}_0})
\end{displaymath}
with asymptotic variance-covariance $\mat{\Sigma}_{\mat{\theta}_0}$ provided in the proof in Appendix~B of the supplementary material.
\end{theorem}
To provide an intuition for the asymptotic variance-covariance structure $\mat{\Sigma}_{\mat{\theta}_0}$, we start from the classical, non-degenerate setting of an MLE $\hat{\mat{\xi}}_n$ in an unconstrained parameter space $\Xi$ containing the true parameter $\mat{\xi}_0$. In this case, $\mat{\Sigma}_{\mat{\xi}_0}$ is symmetric positive definite. Such a matrix can be associated with a hyper-ellipsoid whose axes are the eigenvectors of $\mat{\Sigma}_{\mat{\xi}_0}$. Given the manifold parameter space $\Theta\subseteq\Xi$ with true parameter $\mat{\theta}_0 = \mat{\xi}_0$, the asymptotic variance-covariance $\mat{\Sigma}_{\mat{\theta}_0}$ is a positive semi-definite matrix associated with a (degenerate) hyper-ellipsoid, obtained by intersecting the hyper-ellipsoid of $\mat{\Sigma}_{\mat{\xi}_0}$ with the tangent space of $\Theta$ at $\mat{\theta}_0 = \mat{\xi}_0$ and distorting its shape according to the local curvature of $\Theta$ at $\mat{\theta}_0$.
\begin{remark}
\cref{thm:asymptotic-normality-gmlm} is a special case of a more general asymptotic normality Theorem~6 that also generalizes Theorem~5.23 in \cite{vanderVaart1998}, where $\Theta$ is an open subset of a Euclidean space, which is the simplest form of an embedded manifold. Theorem~6 is provided in Appendix~B in the supplementary material due to its technical nature.
\end{remark}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Simulations}\label{sec:simulations}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
In this section, we report simulation results for the multi-linear normal and the multi-linear Ising model, in which different aspects of the GMLM model are compared against other methods. These are: \textit{Tensor Sliced Inverse Regression} (TSIR) \cite[]{DingCook2015}, an extension of Sliced Inverse Regression (SIR) \cite{Li1991} to tensor-valued predictors; the \textit{Multiway Generalized Canonical Correlation Analysis} (MGCCA) \cite[]{ChenEtAl2021,GirkaEtAl2024}, an extension of canonical correlation analysis (CCA) designed to handle multi-block data with tensor structure; and the Tucker decomposition, which is a higher-order form of principal component analysis (HOPCA) \cite[]{KoldaBader2009}, for both continuous and binary data. For the latter, the binary values are treated as continuous. As part of our baseline analysis, we also incorporate traditional Principal Component Analysis (PCA) on vectorized observations. In the case of the Ising model, we also compare with LPCA (Logistic PCA) and CLPCA (Convex Logistic PCA), both introduced in \cite{LandgrafLee2020}. All experiments are performed with sample sizes $n = 100, 200, 300, 500$ and $750$. Each experiment is repeated $100$ times.
To assess the accuracy of the estimation of $\ten{R}(\ten{X})$ in \cref{thm:sdr}, we compare the estimate with the true vectorized reduction matrix $\mat{B} = \bigkron_{k = r}^{1}\mat{\beta}_k$, as it is compatible with any linear reduction method. We compute the \emph{subspace distance}, $d(\mat{B}, \hat{\mat{B}})$, between $\mat{B}\in\mathbb{R}^{p\times q}$ and an estimate $\hat{\mat{B}}\in\mathbb{R}^{p\times \tilde{q}}$, which satisfies
\begin{displaymath}
d(\mat{B}, \hat{\mat{B}}) \propto \| \mat{B}\pinv{(\t{\mat{B}}\mat{B})}\t{\mat{B}} - \hat{\mat{B}}\pinv{(\t{\hat{\mat{B}}}\hat{\mat{B}})}\t{\hat{\mat{B}}} \|_F,
\end{displaymath}
where $\propto$ signifies proportional to. The proportionality constant\footnote{The proportionality constant depends on the dimension $p$ and the ranks of $\mat{B}$ and $\hat{\mat{B}}$. Its explicit value is $(\min(\rank\mat{B} + \rank\hat{\mat{B}}, 2 p - (\rank\mat{B} + \rank\hat{\mat{B}})))^{-1/2}$.} ensures $d(\mat{B}, \hat{\mat{B}}) \in [0, 1]$. A distance of zero implies that the subspaces coincide, and a distance of one implies that they are orthogonal.
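The subspace distance is straightforward to compute; a minimal Python sketch (with illustrative names, projection matrices via the Moore--Penrose pseudoinverse, and the proportionality constant as in the footnote) is:
\begin{verbatim}
import numpy as np

def proj(B):
    # orthogonal projection onto span(B)
    return B @ np.linalg.pinv(B.T @ B) @ B.T

def subspace_distance(B, B_hat):
    p = B.shape[0]
    rsum = np.linalg.matrix_rank(B) + np.linalg.matrix_rank(B_hat)
    c = 1.0 / np.sqrt(min(rsum, 2 * p - rsum))
    return c * np.linalg.norm(proj(B) - proj(B_hat), "fro")

B = np.eye(4)[:, :2]
print(subspace_distance(B, B))                 # 0.0: identical subspaces
print(subspace_distance(B, np.eye(4)[:, 2:]))  # 1.0: orthogonal subspaces
\end{verbatim}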
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Multi-Linear Normal}\label{sec:sim-tensor-normal}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We generate a random sample $y_i$, $i=1,\ldots, n$, from the standard normal distribution. We then draw independent samples $\ten{X}_i$ from the conditional multi-linear normal distribution of $\ten{X}\mid Y = y_i$ for $i = 1, ..., n$. The conditional distribution $\ten{X}\mid Y = y_i$ depends on the choice of the GMLM parameters $\overline{\ten{\eta}}$, $\mat{\beta}_1, ..., \mat{\beta}_r$, $\mat{\Omega}_1, ..., \mat{\Omega}_r$, and the function $\ten{F}_y$ of $y$. In all experiments we set $\overline{\ten{\eta}} = \mat{0}$. The other parameters and $\ten{F}_y$ are described per experiment. Given the true GMLM parameters and $\ten{F}_y$, we compute the conditional multi-linear normal mean $\ten{\mu}_y = \ten{F}_y\mlm_{k = 1}^{r}\mat{\Omega}_k^{-1}\mat{\beta}_k$ and covariances $\mat{\Sigma}_k = \mat{\Omega}_k^{-1}$ as in \eqref{eq:tnormal_cond_params}.
We consider the following settings:
\begin{itemize}
\item[1a)] $\ten{X}$ is a three-way ($r = 3$) array of dimension $2\times 3\times 5$, and $\ten{F}_y\equiv y$ is a $1\times 1\times 1$ tensor. The true $\mat{\beta}_k$'s are all equal to $\mat{e}_1\in\mathbb{R}^{p_k}$, the first unit vector, for $k \in \{1, 2, 3\}$. The matrices $\mat{\Omega}_k$ have an auto-regression like structure ($\mathrm{AR}(0.5)$), with entries $(\mat{\Omega}_k)_{i j} = 0.5^{|i - j|}$.
\item[1b)] $\ten{X}$ is a three-way ($r = 3$) array of dimension $2\times 3\times 5$, and relates to the response $y$ via a cubic polynomial. This is modeled via $\ten{F}_y$ of dimension $2\times 2\times 2$ given by the twice iterated outer product of the vector $(1, y)$, with elements $(\ten{F}_y)_{i j k} = y^{i + j + k - 3}$ (see the short numerical check after this list). All $\mat{\beta}_k$'s are set to $(\mat{e}_1, \mat{e}_2)\in\mathbb{R}^{p_k\times 2}$ with $\mat{e}_i$ the $i$th unit vector, and the $\mat{\Omega}_k$'s are $\mathrm{AR}(0.5)$.
\item[1c)] Same as 1b), except that the GMLM parameters $\mat{\beta}_k$ are rank $1$ given by
\begin{displaymath}
\mat{\beta}_1 = \begin{pmatrix} 1 & -1 \\ -1 & 1 \end{pmatrix},\quad
\mat{\beta}_2 = \begin{pmatrix} 1 & -1 \\ -1 & 1 \\ 1 & -1 \end{pmatrix},\quad
\mat{\beta}_3 = \begin{pmatrix} 1 & -1 \\ -1 & 1 \\ 1 & -1 \\ -1 & 1 \\ 1 & -1 \end{pmatrix}.
\end{displaymath}
\item[1d)] Same as 1b), but the true $\mat{\Omega}_k$ is tri-diagonal with elements $(\mat{\Omega}_k)_{i j} = \delta_{0, |i - j|} + 0.5\delta_{1, |i - j|}$, where $\delta_{i, j}$ is the Kronecker delta, for $k = 1, 2, 3$.
\item[1e)] For the misspecification model, we let $\ten{X}\mid Y$ be multivariate but \emph{not} multi-linear normal. Let $\ten{X}$ be a $5\times 5$ random matrix with normal entries, $Y$ a univariate standard normal and $\mat{f}_y = (1, \sin(y), \cos(y), \sin(y)\cos(y))$. The true vectorized reduction matrix $\mat{B}$ is $25\times 4$, consisting of the first $4$ columns of the identity; i.e., $B_{i j} = \delta_{i j}$. The variance-covariance matrix $\mat{\Sigma}$ has elements $\Sigma_{i j} = 0.5^{|i - j|}$. Both $\mat{B}$ and $\mat{\Omega} = \mat{\Sigma}^{-1}$ violate the Kronecker product assumptions \eqref{eq:eta1} and \eqref{eq:eta2} of the GMLM model. We set
\begin{displaymath}
\vec{\ten{X}}\mid (Y = y) = \mat{B}\mat{f}_y + \mathcal{N}_{25}(\mat{0}, \mat{\Sigma}),
\end{displaymath}
but we fit the model with the wrong $\ten{F}_y$: We set $\ten{F}_y$ to be a $2\times 2$ matrix with $(\ten{F}_y)_{i j} = y^{i + j - 2}$, $i,j=1,2$. Experiment 1e) involves a deliberately misspecified model designed to evaluate the robustness of our approach.
\end{itemize}
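As referenced in setting 1b), the tensor $\ten{F}_y$ with entries $(\ten{F}_y)_{i j k} = y^{i + j + k - 3}$ is simply the twice iterated outer product of $(1, y)$, i.e., the outer product of three copies of that vector. The following minimal \texttt{numpy} check, with an arbitrary value of $y$, illustrates this.
\begin{verbatim}
import numpy as np

y = 0.7
v = np.array([1.0, y])
F_y = np.einsum("i,j,k->ijk", v, v, v)   # (1, y) outer (1, y) outer (1, y)

i, j, k = 2, 1, 2                        # 1-based indices in {1, 2}
assert np.isclose(F_y[i - 1, j - 1, k - 1], y**(i + j + k - 3))
\end{verbatim}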
\begin{figure}[H]
\centering
\includegraphics[scale = 1]{plots/sim-normal-2x3.pdf}
\caption{\label{fig:sim-normal}Multi-linear normal GMLM. The mean subspace distance $d(\mat{B}, \hat{\mat{B}})$ over $100$ replications is plotted versus sample size on the $x$-axis. The simulation settings are described in \cref{sec:sim-tensor-normal}.}
\end{figure}
We plot the subspace distance between the true and estimated $\mat{B}$ over sample size in \cref{fig:sim-normal}. For 1a), TSIR and GMLM are equivalent and the best performers, as expected. \cite{DingCook2015} had established that TSIR obtains the MLE estimate under a multi-linear (matrix) normal distributed setting. MGCCA, HOPCA and PCA do not even come close, with MGCCA being slightly better than PCA, which, unexpectedly, beats HOPCA.
Continuing with 1b), where we introduced a cubic relation between $Y$ and $\ten{X}$, GMLM performs slightly better than TSIR. This is mainly due to GMLM estimating an $8$-dimensional subspace, which amplifies the small performance boost we gain by avoiding slicing. The GMLM model in 1c) behaves as expected, clearly being the best. The other results are surprising. First, PCA, HOPCA and MGCCA are visually indistinguishable. This is explained by a high signal-to-noise ratio in this particular example. The most unexpected outcome is the failure of TSIR, especially because the conditional distribution $\ten{X}\mid Y$ is multi-linear normal, which, combined with $\cov(\vec\ten{X})$ exhibiting a Kronecker structure, should yield the MLE estimate. The low-rank assumption poses no challenges, as it corresponds to TSIR estimating a one-dimensional linear reduction. A well-known issue with slicing methods, as applied in TSIR, is that conditional multi-modal distributions may lead to estimation problems because varying distribution modes cause slice means to vanish. However, this issue does not arise in simulation 1c). An investigation into this behavior revealed that the problem lies in the estimation of the mode covariance matrices $\mat{O}_k = \E[(\ten{X} - \E\ten{X})_{(k)}\t{(\ten{X} - \E\ten{X})_{(k)}}]$. The mode-wise reductions in TSIR are computed as $\hat{\mat{O}}_k^{-1}\hat{\mat{\Gamma}}_k$, so that poor estimation of $\mat{O}_k$, caused by the high signal-to-noise ratio in this particular simulation, leads to the failure of TSIR. GMLM exhibits excellent performance in high signal-to-noise ratio settings but may be less robust in low signal-to-noise ratio settings, where TSIR can perform better, at least in this specific example. Simulation 1d), incorporating information about the covariance structure, behaves similarly to 1b), except that GMLM gains a statistically significant lead in estimation performance over TSIR. In the last simulation, 1e), where the model is misspecified and GMLM is at a theoretical disadvantage, GMLM, TSIR, and MGCCA are all on par. GMLM demonstrates a modest advantage up to a sample size of 300, whereas MGCCA takes the lead from a sample size of 500 onward. PCA and HOPCA are both outperformed.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Ising Model}\label{sec:sim-ising}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We let $Y_i$ be i.i.d. uniform on $[-1,1]$, $i = 1, \ldots, n$, and $\ten{X}$ be $2\times 3$ with conditional matrix (multi-linear) Ising distribution $\ten{X}\mid Y$ as in \cref{sec:ising_estimation}. Unless otherwise specified, we set the GMLM parameters to be
\begin{displaymath}
\mat{\beta}_1 = \begin{pmatrix}
1 & 0 \\ 0 & 1
\end{pmatrix}, \quad \mat{\beta}_2 = \begin{pmatrix}
1 & 0 \\ 0 & 1 \\ 0 & 0
\end{pmatrix}, \quad \mat{\Omega}_1 = \begin{pmatrix}
0 & -2 \\ -2 & 0
\end{pmatrix}, \quad \mat{\Omega}_2 = \begin{pmatrix}
1 & 0.5 & 0 \\
0.5 & 1 & 0.5 \\
0 & 0.5 & 1
\end{pmatrix}
\end{displaymath}
and
\begin{displaymath}
\ten{F}_y = \begin{pmatrix}
\sin(\pi y) & -\cos(\pi y) \\
\cos(\pi y) & \sin(\pi y)
\end{pmatrix}.
\end{displaymath}
We consider the settings:
\begin{itemize}
\item[2a)] A purely linear relation between $\ten{X}$ and the response, with $\ten{F}_y\equiv y$ a $1\times 1$ tensor, $\t{\mat{\beta}_1} = (1, 0)$ and $\t{\mat{\beta}_2} = (1, 0, 0)$.
\item[2b)] The ``baseline'' simulation with all parameters as described above.
\item[2c)] Low rank regression with both $\mat{\beta}_1$ and $\mat{\beta}_2$ of rank $1$,
\begin{displaymath}
\mat{\beta}_1 = \begin{pmatrix}
1 & 0 \\ 1 & 0
\end{pmatrix}, \qquad \mat{\beta}_2 = \begin{pmatrix}
0 & 0 \\ 1 & -1 \\ 0 & 0
\end{pmatrix}.
\end{displaymath}
\item[2d)] The Ising model was originally designed as a mathematical model of ferromagnetism \cite{Ising1925} in a thermodynamic setting, modeling the interaction effects of elementary magnets (spin up/down corresponding to $0$ and $1$). The model assumes all elementary magnets to be identical, which translates to all having the same coupling strength (two-way interactions), governed by a single parameter related to the temperature of the system. Assuming the magnets are organized in a 2D grid represented by the matrix-valued $\ten{X}$, their interactions are restricted to direct neighbors. This is modeled by setting the true $\mat{\Omega}_k$'s to be tri-diagonal matrices with zero diagonal entries and all non-zero entries identical. We set
\begin{displaymath}
\mat{\Omega}_1 = \frac{1}{2}\begin{pmatrix}
0 & 1 \\ 1 & 0
\end{pmatrix}, \qquad \mat{\Omega}_2 = \begin{pmatrix}
0 & 1 & 0 \\
1 & 0 & 1 \\
0 & 1 & 0
\end{pmatrix}
\end{displaymath}
where $1 / 2$ corresponds to an arbitrary temperature. The mean effect depending on $\ten{F}_y$ can be interpreted as an external magnetic field.
\end{itemize}
\begin{figure}[ht!]
\centering
\includegraphics[scale = 1]{plots/sim-ising.pdf}
\caption{\label{fig:sim-ising}Simulation results for the Ising GMLM. Sample sizes are placed on the $x$-axis and the mean of subspace distance $d(\mat{B}, \hat{\mat{B}})$ over $100$ replications on the $y$-axis.}
\end{figure}
The subspace distances between the estimated and true parameter spaces for sample size $n=100, 200, 300, 500, 750$ are plotted in \cref{fig:sim-ising}. The comparative results are similar across all simulation settings. PCA and HOPCA, both treating the response $\ten{X}$ as continuous, perform poorly. Not much better are LPCA and CLPCA. Similar to PCA and HOPCA, they do not account for the relation with the response, but they are specifically created for binary predictors. MGCCA, which accounts for $y$ in reducing the predictors, outperforms all the PCA variants. TSIR comes next, even though it treats the predictors $\ten{X}$ as continuous. The Ising GMLM model is the best performer in all simulations.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Data Analysis}\label{sec:data-analysis}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We apply GMLM and competing methods to the EEG data set. The data, which are freely available\footnote{\url{http://kdd.ics.uci.edu/databases/eeg/eeg.data.html} (accessed April 29, 2025)}, were partially described in \cref{sec:introduction}. Each subject was exposed to either a single stimulus (S1) or to two stimuli (S1 and S2), which were pictures of objects chosen from the 1980 Snodgrass and Vanderwart picture set (see data documentation). When two stimuli were shown, they were presented either in a matched condition, where S1 was similar to S2, or in a non-matched condition, where S1 differed from S2. These different stimulus conditions introduce a third dimension, making the data a 3-tensor with third-axis dimension $p_3 = 3$. It is common in the analysis of this data set to only consider the S1 stimulus by dropping all but the S1 setting as described in \cref{sec:introduction}, in analogy to \cite{LiKimAltman2010,PfeifferForzaniBura2012,DingCook2015,PfeifferKaplaBura2021}. Here we consider both cases, denoted the 2D EEG data set (matrix-valued) for the S1 stimulus only, and the 3D EEG data set (tensor-valued) incorporating all stimulus scenarios.
SDR methods for regression with matrix-valued predictors were developed by \cite{LiKimAltman2010} (folded SIR), \cite{PfeifferForzaniBura2012} (L(ongitudinal)SIR), \cite{DingCook2014} (dimension folding PCA and PFC), \cite{PfeifferKaplaBura2021} (K-PIR (ls), K-PIR (mle)), and \cite{DingCook2015} (T(ensor)SIR). LSIR is a generalization of the simple SIR nonparametric slicing algorithm \cite{Li1991} to matrix-valued regressors. The K-PIR (ls and mle) algorithm is based on regressing the matrix-valued predictors on the response using a bilinear regression model and can be considered as a continuous parametric modeling extension of LSIR. TSIR generalizes matrix-valued SDR to tensor-valued SDR. In this data set, $p = p_1 p_2 = 16384$ is much larger than $n=122$. To deal with this issue, the predictor data were pre-screened via (2D)$^2$PCA \cite[]{ZhangZhou2005} which reduced the dimension of the predictor matrix to $(p_1, p_2) = (3, 4)$, $(15, 15)$ and $(20, 30)$ \cite[]{PfeifferKaplaBura2021, LiKimAltman2010, DingCook2014, DingCook2015}. \cite{PfeifferKaplaBura2021} also carried out simultaneous dimension reduction and variable selection using the fast POI-C algorithm of \cite{JungEtAl2019} without data pre-processing\footnote{Due to high computational burden, only 10-fold cross-validation was performed for fast POI-C}.
For the 3D EEG version of the data, only TSIR and GMLM are directly applicable, since the other methods are specifically designed for the matrix case or require significantly bigger sample sizes. Nevertheless, both K-PIR and LSIR are easily generalized from matrix-valued to tensor-valued predictors. In addition, it is possible, from an algorithmic point of view, to apply LSIR to the full data set without pre-screening, ignoring any theory or assumptions. Only K-PIR cannot be applied to the full data set.
In contrast, TSIR and our GMLM model can be applied directly to the raw data without pre-screening or variable selection. In general, the sample size does not need to be large for maximum likelihood estimation in the multi-linear normal model. In particular, for matrix normal models, \cite{DrtonEtAl2020} proved that very small sample sizes, as small as $3$,\footnote{The required minimum sample size depends on non-trivial number theoretic relations between the mode dimensions, while the vectorized dimensionality has no specific role.} are sufficient to obtain unique MLEs for Kronecker covariance structures.
Among the tensor-focused methods in \cref{sec:simulations}, we herein consider only TSIR in view of its superior performance.
We report results for the best performing methods from \cite{PfeifferKaplaBura2021}, namely K-PIR (ls), and LSIR from \cite{PfeifferForzaniBura2012}, for $(p_1, p_2) = (3, 4)$, $(15, 15)$, and $(20, 30)$. In addition, we report results for GMLM and two versions of TSIR: TSIR in its original form \cite[]{DingCook2015} as well as TSIR (reg), where we introduce a regularization similar to the one used in the tensor-normal GMLM algorithm. Regularizing TSIR was deemed appropriate in view of the surprising 2D EEG results for TSIR (see Appendix~D in the supplementary material for a comparison of TSIR versus GMLM leading to TSIR (reg)). We report results from applying GMLM as well as GMLM (FFT) (Fast Fourier Transform), which assumes that the time-axis of the EEG data contains important underlying frequencies. For this reason, the time-axis reduction is modeled as a time-series mixture containing only the $10$ frequencies of highest magnitude, as lower-magnitude ones often correspond to noise or less important variations in the signal. The discriminatory ability of the reduced predictors of the various approaches was assessed by AUC, the area under the receiver operating characteristic (ROC) curve \cite[p. 67]{Pepe2003}. We use leave-one-out cross-validation to obtain unbiased AUC estimates. The average AUC values and their standard deviation for the 2D and 3D EEG data are tabulated in \cref{tab:eeg}.
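To convey the frequency-selection idea behind GMLM (FFT) only, one could filter a time series by retaining its $10$ largest-magnitude discrete Fourier coefficients, as in the Python sketch below; this is merely an illustration of the idea under our own assumptions and not the implementation used for the reported results.
\begin{verbatim}
import numpy as np

def keep_top_frequencies(signal, m=10):
    # retain only the m largest-magnitude DFT coefficients of a real signal
    coefs = np.fft.rfft(signal)
    keep = np.argsort(np.abs(coefs))[-m:]
    filtered = np.zeros_like(coefs)
    filtered[keep] = coefs[keep]
    return np.fft.irfft(filtered, n=len(signal))

t = np.linspace(0.0, 1.0, 256, endpoint=False)
noisy = np.sin(2 * np.pi * 5 * t) \
    + 0.3 * np.random.default_rng(1).normal(size=256)
smooth = keep_top_frequencies(noisy, m=10)
print(smooth.shape)   # (256,)
\end{verbatim}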
Across all pre-screening dimensions, K-PIR (ls) has an AUC of $78\%$ in the 2D EEG data, with worse results in the 3D case, where the AUC is $74\%$--$75\%$. This is in stark contrast to all other methods, which gained predictive performance from 2D to 3D. LSIR showed excellent performance with strong pre-screening, peaking at $85\%$ and $88\%$ in the 2D and 3D settings, respectively. Without pre-screening, LSIR fails, which is expected as it does not support the full data setting where the time-axis dimension exceeds the sample size, regardless of its algorithmic applicability.
In the 2D scenario, TSIR shows equally good performance with strong pre-screening at $(3, 4)$, while slowly decreasing in performance down to $80\%$ at $(20, 30)$ and then dropping to $69\%$ AUC for the full data set. The TSIR performance in the 3D case is somewhat surprising in comparison, as it is quite stable over all pre-screening dimensions as well as on the full data. This inconsistency between the 2D and 3D analyses of TSIR, in conjunction with the expectation that TSIR behaves similarly to GMLM in this case, steered us to take a closer look in Appendix~D in the supplementary material. We deduced that the unexpected performance of TSIR is caused by the required inversion of ill-conditioned variance-covariance estimates. To address this limitation, we amended TSIR to TSIR (reg) by introducing regularization of the variance-covariance matrix. TSIR (reg), as can be seen in \cref{tab:eeg}, is very stable, with better performance in the 2D case compared to TSIR and a slight decrease in the 3D setting. Our GMLM has identical performance to TSIR (reg) in the 2D setting, as we expected. In the 3D case, GMLM is slightly better than TSIR (reg) while maintaining stability in performance across the different settings. Finally, GMLM (FFT), which only makes sense to apply to the full data, reaches an AUC of $85\%$ in 2D and $87\%$ in 3D.
\begin{table}[!hpt]
\caption{\label{tab:eeg}Mean AUC values and their standard deviation (in parentheses) based on leave-one-out cross-validation for the EEG imaging data}
\begin{tabular}{l c c c c}
\hline
& \multicolumn{4}{c}{2D EEG: $(p_1, p_2)$} \\
Method & \multicolumn{1}{c}{$(3, 4)$}
& \multicolumn{1}{c}{$(15, 15)$}
& \multicolumn{1}{c}{$(20, 30)$}
& \multicolumn{1}{c}{$(256, 64)$} \\
\hline
K-PIR (ls) & 78\% (4\%) & 78\% (4\%) & 78\% (4\%) & \\
LSIR & 85\% (4\%) & 81\% (4\%) & 83\% (4\%) & 39\% (5\%) \\
TSIR & 85\% (4\%) & 83\% (4\%) & 80\% (4\%) & 69\% (5\%) \\
TSIR (reg) & 85\% (4\%) & 84\% (4\%) & 84\% (4\%) & 84\% (4\%) \\
GMLM & 85\% (4\%) & 84\% (4\%) & 84\% (4\%) & 84\% (4\%) \\
GMLM (FFT) & & & & 85\% (4\%) \\
\hline
& \multicolumn{4}{c}{3D EEG: $(p_1, p_2, p_3)$} \\
Method & \multicolumn{1}{c}{$(3, 4, 3)$}
& \multicolumn{1}{c}{$(15, 15, 3)$}
& \multicolumn{1}{c}{$(20, 30, 3)$}
& \multicolumn{1}{c}{$(256, 64, 3)$} \\
\hline
K-PIR (ls) & 74\% (5\%) & 75\% (4\%) & 75\% (4\%) & \\
LSIR & 88\% (3\%) & 81\% (4\%) & 71\% (5\%) & 52\% (6\%) \\
TSIR & 87\% (3\%) & 89\% (3\%) & 88\% (3\%) & 86\% (3\%) \\
TSIR (reg) & 87\% (3\%) & 84\% (4\%) & 84\% (4\%) & 84\% (4\%) \\
GMLM & 88\% (3\%) & 87\% (3\%) & 87\% (3\%) & 87\% (3\%) \\
GMLM (FFT) & & & & 87\% (3\%) \\
\hline
\end{tabular}
\end{table}
As a real-data application of regression with binary tensor-valued predictors, we perform a proof-of-concept analysis of chess data, where a chess board is interpreted as a collection of binary $8\times 8$ matrices, in Appendix~F of the supplementary material.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Discussion}\label{sec:discussion}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
In this paper, we propose a generalized multi-linear model formulation for the inverse conditional distribution of a tensor-valued predictor given a response and derive a multi-linear sufficient reduction for the corresponding forward regression/classification problem. We also propose estimators for the sufficient reduction and show they are consistent and asymptotically normal. We demonstrate, through a numerical example in supplementary material Appendix~C, the modeling benefits of leveraging the tensor structure of the data.
Obtaining the asymptotic results required leveraging manifolds as a basis for resolving the issue of unidentifiable parameters. This in turn led to an even more flexible modeling framework, which allows building complex and potentially problem-specific parameter spaces that incorporate additional domain-specific knowledge into the model.
We allude to this feature of our approach in \cref{sec:matrix-manifolds}, where we also tabulate different matrix manifolds that can be used as building blocks $\manifold{B}_k$ and $\manifold{O}_k$ of the parameter space in \cref{tab:matrix-manifolds}. For example, our formulation can easily accommodate longitudinal data tabulated in matrix format, where the rows are covariates and the columns are consecutive time points with discrete AR($k$) dependence structure.
Our multi-linear Ising model can be thought of as the extension of the Ising model-based approach of \cite{ChengEtAl2014}, where a $q$-dimensional binary vector is regressed on a $p$-dimensional continuous vector. Yet, our model leverages the inherent structural information of the tensor-valued covariates by assuming separable first and second moments. By doing so, it bypasses the need for sparsity assumptions or penalization, despite the high-dimensional nature of the tensor data. Moreover, it can accommodate a mixture of continuous and binary tensor-valued predictors, which is the subject of future work.
A special case of the Ising model is the one-parameter Ising model without an external field. Given our theory, it is possible to represent this special case as well by constructing a one-dimensional parameter manifold. More precisely, if $\mat{A}$ is a fixed symmetric $p\times p$ matrix with zero main diagonal elements, the parameter space of the one-parameter Ising model has the form $\Theta = \{\mat{0}\}\times \{ \theta\mat{A} + 0.5\mat{I}_p : \theta\in\mathbb{R} \}$. As a result, (a) the model is a vector-valued multi-variate binary model for predictors $\mat{X}\in\{0, 1\}^p$, and (b) the predictors $\mat{X}$ are independent of the response. In the unsupervised setting, the usual question is to determine the single parameter $\theta$. This is the model considered by \cite{XuMukherjee2023}, whose finding that the asymptotic distribution of the MLE depends on the true value of the parameter and is not necessarily normal may, at first glance, seem to contradict our asymptotic result that the MLE $\hat{\theta}_n$ is asymptotically normal. However, \cite{XuMukherjee2023} answer a different question in that they determine the limiting experiment distribution \cite[][Ch~9]{vanderVaart1998}, letting the dimension $p$ of the binary vector-valued predictors go to infinity ($p\to\infty$). We, on the other hand, let the sample size, i.e., the number of observations, go to infinity ($n\to\infty$).
An additional powerful extension of our model involves considering a sum of separable Kronecker predictors. This is motivated by the equivalence of a Kronecker product to a rank $1$ tensor. By allowing a sum of a few separable Kronecker predictors, we remove the implicit rank $1$ constraint. However, if this extension is to be applied to the SDR setting, as in this paper, it is crucial to ensure that the sum of Kronecker products forms a parameter manifold.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Support information, if any, %%
%% should be provided in the %%
%% Acknowledgements section. %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{acks}[Acknowledgments]
We would like to thank the Editor, the Associate Editor and the anonymous referees for their careful reading of the paper and suggestions that improved the quality of this paper.
\end{acks}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Funding information, if any, %%
%% should be provided in the %%
%% funding section. %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{funding}
Both authors were partially supported by the Vienna Science and Technology Fund (WWTF) [10.47379/ICT19018] and the Austrian Science Fund (FWF) research project P 30690-N35.
\end{funding}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Supplementary Material, including data %%
%% sets and code, should be provided in %%
%% {supplement} environment with title %%
%% and short description. It cannot be %%
%% available exclusively as external link. %%
%% All Supplementary Material must be %%
%% available to the reader on Project %%
%% Euclid with the published article. %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{supplement}
\stitle{Supplement to ``Generalized Multilinear Models for Sufficient Dimension Reduction on Tensor-valued Predictors''}
\sdescription{The supplement \cite{KaplaBura2025sup} consists of the Appendices to the main paper. An example for disambiguating vectorization and matricization is in Appendix~A. Appendix~B contains the proofs omitted in the paper and the regularity conditions required in \cref{thm:asymptotic-normality-gmlm}. In Appendix~C we provide a numerical example of the advantages of tensor versus vector representation. An investigation into the behavior of TSIR versus GMLM applied to the EEG data including the description of TSIR (reg) is found in Appendix~D. Technical details for the parameter estimation of the multi-linear Ising model are given in Appendix~E. Appendix~F is a proof of concept application of the Ising GMLM model to chess.}
\end{supplement}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% The Bibliography %%
%% %%
%% imsart-???.bst will be used to %%
%% create a .BBL file for submission. %%
%% %%
%% Note that the displayed Bibliography will not %%
%% necessarily be rendered by Latex exactly as specified %%
%% in the online Instructions for Authors. %%
%% %%
%% MR numbers will be added by VTeX. %%
%% %%
%% Use \cite{...} to cite references in text. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\bibliographystyle{imsart-nameyear} % Style BST file (imsart-number.bst or imsart-nameyear.bst)
\bibliography{references} % Bibliography file (usually '*.bib')
\end{document}