Ising introduction part and effie
parent 7a68948d26
commit 8fb2fd057d

LaTeX/main.bib | 124
@@ -1,21 +1,3 @@
-@misc{MukherjeeEtAl2020,
-  title={Estimation in Tensor Ising Models},
-  author={Somabha Mukherjee and Jaesung Son and Bhaswar B. Bhattacharya},
-  year={2020},
-  eprint={2008.12882},
-  archivePrefix={arXiv},
-  primaryClass={math.ST}
-}
-
-@misc{LiuEtAl2023,
-  title={Tensor Recovery in High-Dimensional Ising Models},
-  author={Tianyu Liu and Somabha Mukherjee and Rahul Biswas},
-  year={2023},
-  eprint={2304.00530},
-  archivePrefix={arXiv},
-  primaryClass={math.ST}
-}
-
 @book{AbadirMagnus2005,
   author = {Abadir, Karim M. and Magnus, Jan R.},
   collection = {Econometric Exercises},
@@ -280,6 +262,18 @@
   year = {2010}
 }

+@article{Bury2013,
+  author = {Thomas Bury},
+  doi = {10.1016/j.physa.2012.10.046},
+  issn = {0378-4371},
+  journal = {Physica A: Statistical Mechanics and its Applications},
+  number = {6},
+  pages = {1375--1385},
+  title = {Market structure explained by pairwise interactions},
+  volume = {392},
+  year = {2013}
+}
+
 @article{CandesEtAl2008,
   author = {Cand\`es, Emmanuel J. and Wakin, Michael B. and Boyd, Stephen P.},
   doi = {10.1007/s00041-008-9045-x},
@@ -608,6 +602,20 @@
   year = {1991}
 }

+@article{CoxWermuth1994,
+  author = {D. R. Cox and Nanny Wermuth},
+  issn = {00063444},
+  journal = {Biometrika},
+  number = {2},
+  pages = {403--408},
+  publisher = {[Oxford University Press, Biometrika Trust]},
+  title = {A Note on the Quadratic Exponential Binary Distribution},
+  url = {http://www.jstor.org/stable/2336971},
+  urldate = {2024-04-11},
+  volume = {81},
+  year = {1994}
+}
+
 @book{Dai2012,
   author = {Dai, Bin},
   isbn = {978-1267-53750-8},
@@ -871,6 +879,18 @@
   year = {2022}
 }

+@inproceedings{FischerIgel2012,
+  address = {Berlin, Heidelberg},
+  author = {Fischer, Asja and Igel, Christian},
+  booktitle = {Progress in Pattern Recognition, Image Analysis, Computer Vision, and Applications},
+  editor = {Alvarez, Luis and Mejail, Marta and Gomez, Luis and Jacobo, Julio},
+  isbn = {978-3-642-33275-3},
+  pages = {14--36},
+  publisher = {Springer Berlin Heidelberg},
+  title = {An Introduction to Restricted Boltzmann Machines},
+  year = {2012}
+}
+
 @article{Fisher1922,
   author = {R. A. Fisher},
   issn = {02643952},
@@ -998,6 +1018,20 @@
   year = {1997}
 }

+@article{Habeck2014,
+  author = {Habeck, Michael},
+  doi = {10.1103/PhysRevE.89.052113},
+  issue = {5},
+  journal = {Phys. Rev. E},
+  month = {May},
+  numpages = {7},
+  pages = {052113},
+  publisher = {American Physical Society},
+  title = {Bayesian approach to inverse statistical mechanics},
+  volume = {89},
+  year = {2014}
+}
+
 @misc{HajriEtAl2017,
   author = {Hajri, Hatem and Said, Salem and Berthoumieu, Yannick},
   doi = {10.1007/978-3-319-68445-1_80},
@@ -1070,6 +1104,19 @@
   year = {2013}
 }

+@article{Hinton2002,
+  author = {Hinton, Geoffrey E.},
+  doi = {10.1162/089976602760128018},
+  issn = {0899-7667},
+  journal = {Neural Computation},
+  month = {08},
+  number = {8},
+  pages = {1771--1800},
+  title = {{Training Products of Experts by Minimizing Contrastive Divergence}},
+  volume = {14},
+  year = {2002}
+}
+
 @misc{Hinton2012,
   author = {Hinton, Geoffrey E.},
   note = {Coursera Lecture 6 - Online; accessed Jan 18, 2024},
@@ -1532,6 +1579,15 @@
   year = {2019}
 }

+@misc{LiuEtAl2023,
+  archiveprefix = {arXiv},
+  author = {Tianyu Liu and Somabha Mukherjee and Rahul Biswas},
+  eprint = {2304.00530},
+  primaryclass = {math.ST},
+  title = {Tensor Recovery in High-Dimensional Ising Models},
+  year = {2023}
+}
+
 @inbook{LiuKoike2007,
   author = {Chunxue Liu and Katsuaki Koike},
   doi = {10.1007/s11004-007-9085-9},
@@ -1735,6 +1791,15 @@
   year = {1943}
 }

+@misc{MukherjeeEtAl2020,
+  archiveprefix = {arXiv},
+  author = {Somabha Mukherjee and Jaesung Son and Bhaswar B. Bhattacharya},
+  eprint = {2008.12882},
+  primaryclass = {math.ST},
+  title = {Estimation in Tensor Ising Models},
+  year = {2020}
+}
+
 @article{Nadarajah2005,
   author = {Saralees Nadarajah},
   doi = {10.1080/02664760500079464},
@@ -1848,7 +1913,6 @@
   year = {2021}
 }

-
 @inproceedings{RabusseauKadri2016,
   author = {Rabusseau, Guillaume and Kadri, Hachem},
   booktitle = {Advances in Neural Information Processing Systems},
@@ -1872,6 +1936,7 @@
   year = {1958}
 }

+
 @inproceedings{Rumelhart1986,
   author = {David E. Rumelhart and Geoffrey E. Hinton and Ronald J. Williams},
   title = {Learning internal representations by error propagation},
@@ -1893,6 +1958,20 @@
   year = {1994}
 }

+@article{SchneidmanEtAl2006,
+  author = {Schneidman, Elad and Berry, Michael J. and Segev, Ronen and Bialek, William},
+  day = {01},
+  doi = {10.1038/nature04701},
+  issn = {1476-4687},
+  journal = {Nature},
+  month = {Apr},
+  number = {7087},
+  pages = {1007--1012},
+  title = {Weak pairwise correlations imply strongly correlated network states in a neural population},
+  volume = {440},
+  year = {2006}
+}
+
 @inproceedings{ShanEtAl2008,
   author = {Shiguang Shan and Bo Cao and Yu Su and Laiyun Qing and Xilin Chen and Wen Gao},
   booktitle = {2008 IEEE Conference on Computer Vision and Pattern Recognition},
@@ -1920,6 +1999,13 @@
   year = {2005}
 }

+@inproceedings{Smolensky1986,
+  author = {Paul Smolensky},
+  title = {Information Processing in Dynamical Systems: Foundations of Harmony Theory},
+  url = {https://stanford.edu/~jlmcc/papers/PDP/Volume%201/Chap6_PDP86.pdf},
+  year = {1986}
+}
+
 @article{Soize2008,
   author = {C. Soize},
   doi = {10.1016/j.probengmech.2007.12.019},
@@ -87,7 +87,7 @@
 \newtheorem{remark}{Remark}


-\crefalias{section}{appendix}
+% \crefalias{section}{appendix}

 \crefname{condition}{Condition}{Conditions}
 \Crefname{condition}{Condition}{Conditions}
@@ -272,7 +272,7 @@
 \maketitle

 \begin{abstract}
-We consider regression or classification problems where the independent variable is matrix- or tensor-valued. We derive a multi-linear sufficient reduction for the regression or classification problem modeling the conditional distribution of the predictors given the response as a member of the quadratic exponential family. Using manifold theory, we prove the consistency and asymptotic normality of the sufficient reduction. We develop estimation procedures of
+We consider supervised learning (regression/classification) problems where the independent variable is tensor-valued. We derive a multi-linear sufficient reduction for the regression or classification problem modeling the conditional distribution of the predictors given the response as a member of the quadratic exponential family. Using manifold theory, we prove the consistency and asymptotic normality of the sufficient reduction. We develop estimation procedures of
 sufficient reductions for both continuous and binary tensor-valued predictors. For continuous predictors, the algorithm is highly computationally efficient and is also applicable to situations where the dimension of
 the reduction exceeds the sample size. We demonstrate the superior performance of our approach in simulations and real-world data examples for both continuous and binary tensor-valued predictors. The \textit{Chess data} analysis results agree with a human player's understanding of the game and confirm the relevance of our approach.
 \end{abstract}
@@ -303,25 +303,36 @@ Complex data are collected at different times and/or under several conditions of
 Tensor regression models have been proposed to leverage the structure inherent in tensor valued data. For instance, \textcite{HaoEtAl2021,ZhouLiZhu2013} focus on tensor covariates, while \textcite{RabusseauKadri2016,LiZhang2017,ZhouLiZhu2013} focus on tensor responses, and \textcite{Hoff2015,Lock2018} consider tensor on tensor regression. \textcite{HaoEtAl2021} modeled a scalar response as a flexible nonparametric function of tensor covariates. \textcite{ZhouLiZhu2013} assume the scalar response has a distribution in the exponential family given the tensor-valued predictors with the link modeled as a multilinear function of the predictors. \textcite{RabusseauKadri2016} model the tensor-valued response as a linear model with tensor valued regression coefficients subject to a multilinear rank constraint. \textcite{LiZhang2017} approach the problem with a similar linear model but instead of a low rank constraint the error term is assumed to have a separable Kronecker product structure while using a generalization of the envelope model \parencite{CookLiChiaromonte2010}. \textcite{ZhouEtAl2023} focus on partially observed tensor response given vector-valued predictors with mode-wise sparsity constraints in the regression coefficients. \textcite{Hoff2015} extends an existing bilinear regression model to a tensor on tensor of conformable modes and dimensions regression model based on a Tucker product. \textcite{Lock2018} uses a tensor contraction to build a penalized least squares model for a tensor with arbitrary number of modes and dimensions.

 Our approach considers the general regression problem of fitting a response of general form (univariate, multivariate, tensor-valued) on a tensor-valued predictor. We operate in the context of sufficient dimension reduction \parencite[e.g.]{Cook1998,Li2018} based on inverse regression, which leads us to regressing the tensor-valued predictor on the response. In our setting, this necessitates transforming the response to tensor-valued functions, regardless of whether it is itself tensor-valued. Because of the setting, our method shares commonalities with the tensor regression models referred to above, yet the modeling and methodology are novel.
-Specifically, our tensor-to-tensor regression model is a generalized multi-linear model similar to the generalized linear model of \cite{ZhouLiZhu2013}. % but with tensor valued response by applying (a known) tensor valued function to the response in an inverse regression setting, reversing the role of response and predictors.
+Specifically, our tensor-to-tensor regression model is a generalized multi-linear model similar to the generalized linear model of \textcite{ZhouLiZhu2013}. % but with tensor valued response by applying (a known) tensor valued function to the response in an inverse regression setting, reversing the role of response and predictors.
-To bypass the explosion of number of parameters to estimate, we assume the inverse regression error covariance has Kronecker product structure as do \textcite{LiZhang2017}. Our maximum likelihood-based estimation does not require any penalty terms in contrast to the least squares and/or sparse approaches \cite{????} . In the case of a tensor (multilinear) normal, given the tensor-valued function of the response, our model exhibits similarities to the multilinear modeling of \textcite{Hoff2015}, but we use a generalized multilinear model and estimate the parameters with maximum likelihood instead of least squares. Moreover, a common issue in multilinear tensor regression models is the unidentifiability of the parameters, which we address in a completely different manner. For example, \cite{LiZhang2017} develop theory that is based on orthogonal projection matrices to uniquely identify a subspace, while our approach is more general as it uses manifold theory.
+To bypass the explosion of the number of parameters to estimate, we assume the inverse regression error covariance has Kronecker product structure as do \textcite{LiZhang2017}. Our maximum likelihood-based estimation does not require any penalty terms, in contrast to the least squares and/or sparse approaches \parencite{ZhouLiZhu2013}. In the case of a tensor (multilinear) normal, given the tensor-valued function of the response, our model exhibits similarities to the multilinear modeling of \textcite{Hoff2015}, but we use a generalized multilinear model and estimate the parameters with maximum likelihood instead of least squares. Moreover, a common issue in multilinear tensor regression models is the unidentifiability of the parameters, which we address in a completely different manner. For example, \textcite{LiZhang2017} develop theory that is based on orthogonal projection matrices to uniquely identify a subspace, while our approach is more general as it uses manifold theory.


-In this paper we present a model-based \emph{Sufficient Dimension Reduction} (SDR) method for tensor-valued data with distribution in the quadratic exponential family assuming a separable Kronecker product structure of the first and second moment. By generalizing the parameter space to embedded manifolds we obtain consistency and asymptotic normality results while allowing great modeling flexibility in the linear sufficient dimension reduction.
+In this paper, we present a model-based \emph{Sufficient Dimension Reduction} (SDR) method for tensor-valued data with distribution in the quadratic exponential family assuming a separable Kronecker product structure of the first and second moment. By generalizing the parameter space to embedded manifolds we obtain consistency and asymptotic normality results while allowing great modeling flexibility in the linear sufficient dimension reduction.

 The quadratic exponential family contains the tensor normal and the tensor Ising distributions, for continuous and binary tensor-valued random variables, respectively.

-Multilinear tensor normal models have been used in various applications, including medical imaging \parencite{BasserPajevic2007,DrydenEtAl2009}, spatio-temporal data analysis \parencite{GreenewaldHero2014}, regression analysis for longitudinal relational data \parencite{Hoff2015}. One of the most important uses of the multilinear normal (MLN) distribution, and hence tensor analysis, is perhaps in magnetic resonance imaging (MRI) \parencite{OhlsonEtAl2013}. A recent survey \parencite{WangEtAl2022} and references therein contain more information and potential applications of multilinear tensor normal models.
+Multi-linear normal models have been used in various applications, including medical imaging \parencite{BasserPajevic2007,DrydenEtAl2009}, spatio-temporal data analysis \parencite{GreenewaldHero2014}, and regression analysis for longitudinal relational data \parencite{Hoff2015}. One of the most important uses of the multi-linear normal (MLN) distribution, and hence tensor analysis, is perhaps in magnetic resonance imaging (MRI) \parencite{OhlsonEtAl2013}. A recent survey \parencite{WangEtAl2022} and the references therein contain more information and potential applications of multilinear tensor normal models.

-The Ising model for multivariate binary outcomes belongs to the class of discrete exponential families. Its defining feature is that the sufficient statistic involves a quadratic term to capture correlations arising from pairwise interactions.
-The tensor Ising model is a higher-order Ising model for tensor-valued binary outcomes.
-%From \cite{MukherjeeEtAl2020}
-Higher-order Ising models arise naturally in the study of multi-atom interactions in lattice gas models, such as the square-lattice eight-vertex model, the Ashkin-Teller model, and Suzuki's pseudo-3D anisotropic model (cf. [6, 33, 36, 37, 49, 55, 56, 61, 62] and the references therein). More recently, higher-order spin systems have also been proposed for modeling peer-group effects in social networks [22]. \textcite{MukherjeeEtAl2020} proposed a maximum pseudo-likelihood estimation algorithm for a one-parameter tensor-Ising model. \efi{Daniel: comment on what these guys do and contrast with your setting} In our approach, the parameter is not constrained to be scalar
-We derive maximum likelihood estimates for all first and second order interactions and propose a gradient-based optimization algorithm.
-Our results in the framework of the quadratic exponential family for tensor-valued variables; i.e., consistency and asymptotic normality, apply to both tensor normal and tensor Ising models.
+The Ising\footnote{Also known as the \emph{Lenz-Ising} model, as the physical assumptions of the model were developed by both Lenz and Ising \parencite{Niss2005}, and Ising gave a closed-form solution for the 1D lattice \parencite{Ising1925}.} model \parencite{Lenz1920,Ising1925,Niss2005} is a mathematical model originating in statistical physics to study ferromagnetism in a thermodynamic setting. It describes magnetic dipoles (atomic ``spins'' with values $\pm 1$) under an external magnetic field (first moments) while allowing two-way interactions (second moments) between direct neighbours on a lattice, a discrete grid. The Ising problem, as known in statistical physics, is to compute observables such as the magnetizations and correlations under the Boltzmann distribution\footnote{The Boltzmann distribution is a probability distribution over the states of a physical system in thermal equilibrium (constant temperature) that assigns higher probabilities to states with lower energy.} while the interaction structure and the magnetic fields are given. The ``reverse'' problem, where the couplings and fields are unknown and to be determined from observations of the spins, as in statistical inference, is known as the \emph{inverse Ising problem} \parencite{NguyenEtAl2017}. From this point of view, the Ising model is a member of a discrete quadratic exponential family \parencite{CoxWermuth1994,JohnsonEtAl1997} for multivariate binary outcomes where the interaction structure (non-zero correlations) is determined by the lattice. Generally, neither the values of the couplings nor the interaction structure are known.
+In consequence, the Ising model is mostly used to model multivariate binary data in statistics, with states $\{0, 1\}$ instead of $\pm 1$ and an unrestricted (full) interaction structure. It is related to a multitude of other models, among which the most prominent are: \emph{Graphical Models} and \emph{Markov Random Fields} to describe conditional dependence \parencite{Lauritzen1996,WainwrightJordan2008,LauritzenRichardson2002}, \emph{Potts models} \parencite{Besag1974,ChakrabortyEtAl2022} which generalize the Ising model to multiple states, the \emph{multivariate Bernoulli distribution} \parencite{Whittaker1990,JohnsonEtAl1997,DaiDingWahba2013} that also accommodates higher-order interactions (three-way and higher), and \emph{(restricted) Boltzmann machines} \parencite{Smolensky1986,Hinton2002,FischerIgel2012} that introduce additional hidden variables for learning binary distributions. Most of these models can be used in both supervised and unsupervised settings.
+Applications of the Ising model (and variations thereof) include modeling neural firing patterns \parencite{SchneidmanEtAl2006}, gene expression data analysis \parencite{LezonEtAl2006}, and modeling financial markets \parencite{Bury2013}. See also \textcite{NguyenEtAl2017}.
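For reference, the pairwise Ising probability model referred to above takes, in its standard textbook form (generic notation, not the parameterization developed later in this paper), the Boltzmann form
\begin{equation*}
  \Pr(\mat{s}) = \frac{1}{Z(\mat{h}, \mat{J})} \exp\Big( \sum_{i} h_i s_i + \sum_{i < j} J_{ij} s_i s_j \Big),
  \qquad
  \mat{s} \in \{-1, +1\}^p,
\end{equation*}
where the normalizing constant $Z(\mat{h}, \mat{J})$ sums the exponential over all $2^p$ spin configurations. The external fields $h_i$ drive the first moments, the couplings $J_{ij}$ drive the pairwise (second-moment) interactions, and the inverse Ising problem is to recover $(\mat{h}, \mat{J})$ from observed spin configurations.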
-The structure of this paper is as follows. In \cref{sec:notation} we introduce our notation. \Cref{sec:problem-formulation} decribes the exact problem and in \cref{sec:gmlm-model} we introduce our model. Continuing in \cref{sec:ml-estimation} we provide the basis for a general maximum likelihood estimation procedure and derive specialized methods for tensor normal as well as the tensor Ising distributions. \Cref{sec:manifolds} gives a short introduction into manifolds and provides the basis for applying the consistency and asymtotic normality results from \cref{sec:asymtotics}. Simulations for continuous and binary predicotrs are subject of \cref{sec:simulations}. Finally, in \cref{sec:data-analysis} we apply our model to EEG data and perform a prove of concept data analysis example where a chess board is interprated as a collection of binary $8\times 8$ matrices.
+The $r$-tensor Ising model in statistical physics is a generalization of the Ising model to $r$-order interactions. \textcite{MukherjeeEtAl2020} study the one-parameter discrete exponential family for modeling dependent binary data where the interaction structure is given. In \textcite{LiuEtAl2023} the tensor structure itself is to be inferred. These models are fundamentally different from our approach where we rely on properties of the quadratic exponential family which models up to second-order interactions. Another important difference is that we adopt the multi-linear formulation as it is inherently linked to the observable structure of multi-way data as opposed to describing the model coefficients with an $r$-order tensor structure.
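For comparison, the one-parameter $r$-tensor Ising family of \textcite{MukherjeeEtAl2020} is, as we read it, of the form
\begin{equation*}
  \Pr\nolimits_{\beta}(\mat{\sigma}) \propto \exp\Big( \beta \sum_{i_1, \ldots, i_r} J_{i_1 \cdots i_r} \, \sigma_{i_1} \cdots \sigma_{i_r} \Big),
  \qquad
  \mat{\sigma} \in \{-1, +1\}^N,
\end{equation*}
with a known coupling tensor $J$ and a single unknown inverse temperature $\beta$, whereas our model parameterizes all first- and second-order interaction coefficients through the multi-linear structure.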
+
+% \textcite{LiuEtAl2023,MukherjeeEtAl2020,ChengEtAl2014,Habeck2014}
+
+% The Ising model for multivariate binary outcomes belongs to the class of discrete exponential families. Its defining feature is that the sufficient statistic involves a quadratic term to capture correlations arising from pairwise interactions.
+% The tensor Ising model is a higher-order Ising model for tensor-valued binary outcomes.
+% %From \cite{MukherjeeEtAl2020}
+% Higher-order Ising models arise naturally in the study of multi-atom interactions in lattice gas models, such as the square-lattice eight-vertex model, the Ashkin-Teller model, and Suzuki's pseudo-3D anisotropic model (cf. [6, 33, 36, 37, 49, 55, 56, 61, 62] and the references therein). More recently, higher-order spin systems have also been proposed for modeling peer-group effects in social networks [22]. \efi{Daniel: comment on what these guys do and contrast with your setting} In our approach, the parameter is not constrained to be scalar
+% We derive maximum likelihood estimates for all first and second order interactions and propose a gradient-based optimization algorithm.
+
+As an aside, even though our motivation stems from the SDR perspective, our proposal concerns inference on any regression model with a tensor-valued response and any type of predictors. Thus, our approach can be used as a stand-alone model for such data regardless of whether one is interested in deriving sufficient reductions and/or reducing the dimension of the data. Our results in the framework of the quadratic exponential family for tensor-valued variables, i.e., consistency and asymptotic normality, apply to both multi-linear normal \efi{\ref{?} and multi-linear Ising models, as defined in this paper in Sec. ??.}
+
+The structure of this paper is as follows. We introduce our notation in \cref{sec:notation}. \Cref{sec:problem-formulation} details the problem we consider and in \cref{sec:gmlm-model} we introduce our model. Continuing in \cref{sec:ml-estimation} we provide the basis for a general maximum likelihood estimation procedure and derive specialized methods for the tensor normal as well as the tensor Ising distributions. \Cref{sec:manifolds} gives a short introduction to manifolds and provides the basis for applying the consistency and asymptotic normality results from \cref{sec:asymtotics}. Simulations for continuous and binary predictors are the subject of \cref{sec:simulations}. Finally, in \cref{sec:data-analysis} we apply our model to EEG data and perform a proof of concept data analysis example where a chess board is interpreted as a collection of binary $8\times 8$ matrices.

 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \section{Notation}\label{sec:notation}
@@ -395,13 +406,13 @@ To find such a reduction $\ten{R}$, we leverage the equivalence relation pointed
 \end{equation}
 According to \eqref{eq:inverse-regression-sdr}, a \textit{sufficient statistic} $\ten{R}(\ten{X})$ for $Y$ in the inverse regression $\ten{X}\mid Y$, where $Y$ is considered as a parameter indexing the model, is also a \textit{sufficient reduction} for $\ten{X}$ in the forward regression $Y\mid\ten{X}$. %The equivalent inverse regression in \eqref{eq:inverse-regression-sdr} provides exhaustive characterization of $\ten{R}(\ten{X})$.

 The factorization theorem is the usual tool to identify sufficient statistics and requires a distributional model. In this paper, we assume the distribution of $\ten{X}\mid Y$ belongs to the \emph{quadratic exponential family} in order to (a) simplify modeling and (b) keep estimation feasible. We assume that $\ten{X}\mid Y$ is a full rank quadratic exponential family with density
 \begin{align}
 f_{\mat{\eta}_y}(\ten{X}\mid Y = y)
 &= h(\ten{X})\exp(\t{\mat{\eta}_y}\mat{t}(\ten{X}) - b(\mat{\eta}_y)) \nonumber \\
 &= h(\ten{X})\exp(\langle \mat{t}_1(\ten{X}), \mat{\eta}_{1y} \rangle + \langle \mat{t}_2(\ten{X}), \mat{\eta}_{2y} \rangle - b(\mat{\eta}_{y})) \label{eq:quad-density}
 \end{align}
-where $\mat{t}_1(\ten{X})=\vec \ten{X}$ and $\mat{t}_2(\ten{X})$ is linear in $\ten{X}\circ\ten{X}$. The dependence of $\ten{X}$ on $Y$ is fully captured in the natural parameter $\mat{\eta}_y$. The function $h$ is non-negative real-valued and $b$ is assumed to be at least twice continuously differentiable and strictly convex. An important feature of the \emph{quadratic exponential family} is that the distribution of its members is fully characterized by their first two moments. Distributions within the quadratic exponential family include the \emph{tensor normal} (\cref{sec:tensor-normal-estimation}) and \emph{tensor Ising model} (\cref{sec:ising_estimation}, a generalization of the (inverse)\footnote{\todo{}} Ising model which is multi-variate Bernoulli with up to second order interactions) and mixtures of these two.
+where $\mat{t}_1(\ten{X})=\vec \ten{X}$ and $\mat{t}_2(\ten{X})$ is linear in $\ten{X}\circ\ten{X}$. The dependence of $\ten{X}$ on $Y$ is fully captured in the natural parameter $\mat{\eta}_y$. The function $h$ is non-negative real-valued and $b$ is assumed to be at least twice continuously differentiable and strictly convex. An important feature of the \emph{quadratic exponential family} is that the distribution of its members is fully characterized by their first two moments. Distributions within the quadratic exponential family include the \emph{tensor normal} (\cref{sec:tensor-normal-estimation}) and \emph{tensor Ising model} (\cref{sec:ising_estimation}, a generalization of the (inverse) Ising model which is a multi-variate Bernoulli with up to second order interactions) and mixtures of these two.
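As a simple point of reference, in the vector special case where $\ten{X} \mid Y = y$ is multivariate normal with mean $\mat{\mu}_y$ and covariance $\mat{\Sigma}$ free of $y$, the density matches \eqref{eq:quad-density} with
\begin{equation*}
  \mat{t}_1(\ten{X}) = \ten{X}, \qquad
  \mat{t}_2(\ten{X}) = \vec(\ten{X} \circ \ten{X}), \qquad
  \mat{\eta}_{1y} = \mat{\Sigma}^{-1} \mat{\mu}_y, \qquad
  \mat{\eta}_{2y} = -\tfrac{1}{2} \vec(\mat{\Sigma}^{-1}),
\end{equation*}
so that, in this special case, the dependence on $y$ enters only through $\mat{\eta}_{1y}$.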

 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \section{The Generalized Multi-Linear Model}\label{sec:gmlm-model}
@@ -530,7 +541,7 @@ An iterative cyclic updating scheme is derived in \cref{sec:tensor-normal-estima
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \subsection{Tensor Normal}\label{sec:tensor-normal-estimation}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-The tensor normal, also known as the \emph{multilinear normal}, is the extension of the matrix normal to tensor-valued random variables and a member of the quadratic exponential family \eqref{eq:quadratic-exp-fam} under \eqref{eq:eta2}. \textcite{Dawid1981,Arnold1981} introduced the term matrix normal and, in particular, \textcite{Arnold1981} provided several theoretical results, such as its density, moments and conditional distributions of its components. The matrix normal distribution is a bilinear normal distribution; a distribution of a two-way array, each component
+The \emph{multi-linear normal} is the extension of the matrix normal to tensor-valued random variables and a member of the quadratic exponential family \eqref{eq:quadratic-exp-fam} under \eqref{eq:eta2}. \textcite{Dawid1981,Arnold1981} introduced the term matrix normal and, in particular, \textcite{Arnold1981} provided several theoretical results, such as its density, moments and conditional distributions of its components. The matrix normal distribution is a bilinear normal distribution; a distribution of a two-way array, each component
 representing a vector of observations \parencite{OhlsonEtAl2013}. \textcite{KolloVonRosen2005,Hoff2011,OhlsonEtAl2013} presented the extension of the bilinear to the multilinear normal distribution, what we call tensor normal, using a parallel extension of bilinear matrices to multilinear tensors \parencite{Comon2009}.

 The defining feature of the matrix normal distribution, and its tensor extension, is the Kronecker product structure of its covariance. This formulation, where the covariates are multivariate normal with multiway covariance structure modeled as a Kronecker product of matrices of much lower dimension, aims to overcome the significant modeling and computational challenges arising from the high computational complexity of manipulating tensor representations \parencite[see, e.g.,][]{HillarLim2013,WangEtAl2022}.
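To illustrate the parsimony of the Kronecker structure with a generic count (writing $\mat{\Sigma}_1, \mat{\Sigma}_2$ for the mode-wise covariance factors of a $p_1 \times p_2$ matrix-valued predictor, symbols chosen here only for illustration), the structured covariance
\begin{equation*}
  \operatorname{cov}(\vec \ten{X}) = \mat{\Sigma}_2 \otimes \mat{\Sigma}_1
\end{equation*}
is described by $p_1 (p_1 + 1)/2 + p_2 (p_2 + 1)/2$ parameters (up to a scale shared between the two factors) instead of the $p_1 p_2 (p_1 p_2 + 1)/2$ parameters of an unstructured covariance; for $p_1 = p_2 = 64$ this is $4160$ versus roughly $8.4$ million.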
@@ -1147,6 +1158,8 @@ The results of our analysis in the previous paragraph agree with the configurati
 \section{Discussion}
 We have addressed sufficient dimension reduction for tensor-valued predictors in regression or classification problems. Proposing a generalized multilinear model for the inverse conditional distribution, we provided a multilinear sufficient reduction with consistent and asymptotically normal parameter estimates. Moreover, our ansatz of proving the asymptotic results by leveraging manifolds as a basis for resolving the issue of unidentifiable parameters leads to an even more flexible modeling framework. This allows building complex and potentially problem-specific parameter spaces that incorporate additional domain-specific knowledge into the model.

+Our multi-linear Ising model can be thought of as the extension of the Ising model-based approach of \textcite{ChengEtAl2014}, where a $q$-dimensional binary vector is regressed on a $p$-dimensional continuous vector. Yet, our model does not require penalization or sparsity assumptions, despite the tensor nature of the data, by leveraging the inherent structural information of the tensor-valued covariates assuming separable first and second moments. Moreover, it can accommodate a mixture of continuous and binary tensor-valued predictors, which is a subject of future work.
+
 An additional powerful extension of our model involves considering a sum of separable Kronecker predictors. This is motivated by the equivalence of a Kronecker product to a rank 1 tensor. By allowing a sum of a few separable Kronecker predictors, we remove the implicit rank 1 constraint. However, if this extension is to be applied to the SDR setting, as in this paper, it is crucial to ensure that the sum of Kronecker products forms a parameter manifold to apply our theory. While we anticipate that this approach can lead to intriguing and powerful models, there are certain details that need to be resolved first.

 \todo{finish!}