


@ 98,6 +98,7 @@





\newcommand{\ternary}[3]{{#2}{\ \mathrm{if}\ }{#1}{\ \mathrm{else}\ }{#3}}










% Define math macros





\renewcommand{\hat}{\widehat}





% \newcommand*{\ten}[1]{\mathcal{#1}}





\DeclareMathOperator{\sym}{sym}





\renewcommand*{\vec}{\operatorname{vec}}




@ 142,9 +143,10 @@





\newcommand{\SkewSymMat}[1]{\mathrm{Skew}^{{#1}\times {#1}}}





\newcommand{\SymPosDefMat}[1]{\mathrm{Sym}_{++}^{{#1}\times {#1}}}





\newcommand{\SymDiagZeroMat}[1]{\mathrm{Sym}_{\diag=0}^{p\times p}}





\newcommand{\SymBand}[2]{\mathrm{SymBand}^{{#1}\times {#1}}_{#2}}










\newcommand{\todo}[1]{\text{\color{red}TODO: #1}}





\newcommand{\efi}[1]{\text{\color{blue}Effie: #1}}





\newcommand{\todo}[1]{{\color{red}TODO: #1}}





\newcommand{\efi}[1]{{\color{blue}Effie: #1}}





% \newcommand{\todo}[1]{}





% \newcommand{\efi}[1]{}









@ 444,9 +446,6 @@ Distributions within the quadratic exponential family include the \emph{tensor n





\section{The Generalized MultiLinear Model}





%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%










% In model \eqref{eq:quaddensity}, the relationship of $\ten{X}$ and $Y$ is absorbed in $\mat{\eta}_y$, and $\mat{t}(\ten{X})$ is the minimal sufficient statistic for the pseudoparameter $\mat{\eta}_y = (\vec\mat{\eta}_1, \vec\mat{\eta}_2)$ with $\mat{t}(\ten{X}) = (\vec{\ten{X}}, \t{\mat{D}_p}\mat{D}_p\vech(\ten{X}\circ\ten{X}))$ where $\mat{D}_p$ is the duplication matrix \textcite[Ch.~11]{MatrixAlgebraAbadirMagnus2005}, for $p = \prod_{j = 1}^r p_j$. $\t{\mat{D}_p}\mat{D}_p$ acts on $\vech(\ten{X}\circ\ten{X})$ by multiplying the off diagonal components of $\vech(\ten{X}\circ\ten{X})$ with $2$ while keeping the diagonal components untouched.





% The duplication matrix ensures the natural parameter vector $\mat{\eta}_y$ is minimal.










In model \eqref{eq:quaddensity}, the relationship of $\ten{X}$ and $Y$ is absorbed in $\mat{\eta}_y$, and $\mat{t}(\ten{X})$ is the minimal sufficient statistic for the \textit{pseudo}parameter\footnote{$\mat{\eta}_y$ is a function of the response $Y$, thus it is not a parameter in the formal statistical sense. It is considered as a parameter when using the equivalence in \eqref{eq:inverseregressionsdr} and view $Y$ as a parameter as a device to derive the sufficient reduction from the inverse regression.} $\mat{\eta}_y = (\mat{\eta}_{1y}, \mat{\eta}_{2y})$ with





\begin{align}\label{eq:tstat}





\mat{t}(\ten{X}) &= (\mat{t}_1(\ten{X}),\mat{t}_2(\ten{X}))=(\vec{\ten{X}}, \mat{T}_2\vech((\vec\ten{X})\t{(\vec\ten{X})})),




@ 455,7 +454,6 @@ where the $d\times p(p + 1) / 2$ dimensional matrix $\mat{T}_2$ with $p = \prod_










We can reexpress the exponent in \eqref{eq:quaddensity} as





\begin{align*}





% \t{\mat{\eta}_y} \mat{t}(\ten{X}) = \langle \ten{X}, \mat{\eta}_1 \rangle + \langle \ten{X}\circ\ten{X}, \mat{D}_p\mat{\eta}_2 \rangle





\t{\mat{\eta}_y} \mat{t}(\ten{X})





&= \langle \vec \ten{X}, \mat{\eta}_{1y} \rangle + \langle \mat{T}_2\vech(\ten{X}\circ\ten{X}), \mat{\eta}_{2y} \rangle = \langle \vec \ten{X}, \mat{\eta}_{1y} \rangle + \langle \vec(\ten{X}\circ\ten{X}), \t{(\mat{T}_2\pinv{\mat{D}_p})}\mat{\eta}_{2y} \rangle





\end{align*}




@ 465,26 +463,23 @@ where $\mat{D}_p$ is the \emph{duplication matrix} from \textcite[Ch.~11]{Matrix





\end{equation}





The exponential family in \eqref{eq:quadraticexpfam} is easily generalizable to any order. This, though, would result in the number of parameters becoming prohibitive to estimate. This is also the reason why we opted for the second order exponential family in our formulation.










By the equivalence in \eqref{eq:inverseregressionsdr}, in order to find the sufficient reduction $\ten{R}(\ten{X})$ we need to infer $\mat{\eta}_{1y}$, and $\mat{\eta}_{2y}$. This is reminiscent of generalized linear modeling, which we extend to a multilinear formulation next.





Suppose $\ten{F}_y$ is a known mapping of $y$ %with values in $\mathbb{R}^{q_1\times ...\times q_r}$





with zero expectation $\E_Y\ten{F}_Y = 0$. We assume the dependence of $\ten{X}$ and $Y$ is reflected only in the first parameter and let





By the equivalence in \eqref{eq:inverseregressionsdr}, in order to find the sufficient reduction $\ten{R}(\ten{X})$ we need to infer $\mat{\eta}_{1y}$, and $\mat{\eta}_{2y}$. This is reminiscent of generalized linear modeling, which we extend to a multilinear formulation next.





Suppose $\ten{F}_y$ is a known mapping of $y$ with zero expectation $\E_Y\ten{F}_Y = 0$. We assume the dependence of $\ten{X}$ and $Y$ is reflected only in the first parameter and let





\begin{align}





\mat{\eta}_{1y} &= \vec{\overline{\ten{\eta}}} + \mat{B}\vec\ten{F}_y, \label{eq:eta1manifold} \\





\mat{\eta}_{2} &= \t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})})}\vec(c\,\mat{\Omega}), \label{eq:eta2manifold}





\end{align}





where $\overline{\ten{\eta}}\in\mathbb{R}^{p_1\times\ldots\times p_r}$, $\mat{\Omega} \in \mathbb{R}^{p \times p}$ is positive definite with $p = \prod_{j = 1}^{r} p_j$, and $c\in\mathbb{R}$ is a known constant determined by the distribution to ease modeling. That is, we assume that only $\mat{\eta}_{1y}$ depends on $Y$ through $\mat{B}$. The second parameter $\mat{\eta}_2$ captures the second order interaction structure of $\ten{X}$, which we assume not to depend on the response $Y$. In order to relate individual modes of $\ten{X}$ to the response, allowing flexibility in modeling, we assume $\ten{F}_y$ takes values in $\mathbb{R}^{q_1\times ...\times q_r}$; that is, $\ten{F}_y$ is a tensor valued independent variable in the inverse regression setting. This, in turn, leads to imposing corresponding tensor structure to the regression parameter $\mat{B}$. Thus, \eqref{eq:eta1manifold} becomes





where $\overline{\ten{\eta}}\in\mathbb{R}^{p_1\times\ldots\times p_r}$, $\mat{\Omega} \in \mathbb{R}^{p \times p}$ is positive definite with $p = \prod_{j = 1}^{r} p_j$, and $c\in\mathbb{R}$ is a known constant determined by the distribution to ease modeling. That is, we assume that only $\mat{\eta}_{1y}$ depends on $Y$ through $\mat{B}$. The second parameter $\mat{\eta}_2$ captures the second order interaction structure of $\ten{X}$, which we assume not to depend on the response $Y$. In order to relate individual modes of $\ten{X}$ to the response, allowing flexibility in modeling, we assume $\ten{F}_y$ takes values in $\mathbb{R}^{q_1\times ...\times q_r}$; that is, $\ten{F}_y$ is a tensor valued independent variable in the inverse regression setting. This, in turn, leads to imposing corresponding tensor structure to the regression parameter $\mat{B}$. Thus, \eqref{eq:eta1manifold} becomes





\begin{align}





\mat{\eta}_{1y} &= %\equiv





\mat{\eta}_{1y} &=





\vec\biggl(\overline{\ten{\eta}} + \ten{F}_y\mlm_{j = 1}^{r}\mat{\beta}_j\biggr), \label{eq:eta1}





\end{align}





where $\mat{B} = \bigotimes_{j = r}^{1}\mat{\beta}_j$ and the component matrices $\mat{\beta}_j\in\mathbb{R}^{p_j\times q_j}$ are of known rank for $j = 1, \ldots, r$. Given the high potential value of $p$, we further assume that





where $\mat{B} = \bigotimes_{j = r}^{1}\mat{\beta}_j$ and the component matrices $\mat{\beta}_j\in\mathbb{R}^{p_j\times q_j}$ are of known rank for $j = 1, \ldots, r$. Given the high potential value of $p$, we further assume that





\begin{align}





\t{(\mat{T}_2\pinv{\mat{D}_p})}\mat{\eta}_{2y}= \t{(\mat{T}_2\pinv{\mat{D}_p})}\mat{\eta}_{2} &= \vec\biggl(c\bigotimes_{j = r}^{1}\mat{\Omega}_j\biggr). \label{eq:eta2}





\end{align}





where $\mat{\Omega}_j\in\mathbb{R}^{p_j\times p_j}$ are symmetric positive definite matrices for $j = 1, \ldots, r$. That is, we require $\mat{\Omega} = \bigotimes_{j = r}^{1}\mat{\Omega}_j$, which substantially reduces the number of parameters to estimate in $\mat{\Omega}$. The assumption that the $\mat{\Omega}_j$'s be positive definite is possible due to the constant $c$.





Equation \eqref{eq:eta2} is underdetermined since $\t{(\mat{T}_2\pinv{\mat{D}_p})}$ has full column rank $d < p^2$ (with a nonstrict inequality if $\ten{X}$ is univariate) but $\mat{\eta}_2$ is uniquely determined given any $\mat{\Omega}$ as $\t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})})}$ has full row rank. We let $\mat{\xi} = (\vec{\overline{\ten{\eta}}}, \vec{\mat{B}}, \vech{\mat{\Omega}})$ be a $p(p + 2 q + 3) / 2$parameter vector and $\mat{\theta} = (\vec{\overline{\ten{\eta}}}, \vec{\mat{B}}, \vech{\mat{\Omega}})$ be the constrained parameter vector, where $\mat{B}=\bigotimes_{j = r}^{1}\mat{\beta}_j$ and $\mat{\Omega} = \bigotimes_{j = r}^{1}\mat{\Omega}_j$. We also let $\Xi$ and $\Theta$ denote the unconstrained and constrained parameter spaces, with $\mat{\xi}$ and $\mat{\theta}$ varying in $\Xi$ and $\Theta$, respectively. The parameter space $\Xi$ is an open subset of $\mathbb{R}^{p(p + 2 q + 3) / 2}$ so that \eqref{eq:quadraticexpfam} is a proper density.










\todo{Maybe already here introduce the ``constraint'' set of $\Omega$'s allowed as $\{ \Omega\in\mathbb{R}_{++}^{p\times p} : \vec{\Omega} = \t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})}\mat{T}_2\pinv{\mat{D}_p})}\vec{\Omega} \}$}





where $\mat{\Omega}_j\in\mathbb{R}^{p_j\times p_j}$ are symmetric positive definite matrices for $j = 1, \ldots, r$. That is, we require $\mat{\Omega} = \bigotimes_{j = r}^{1}\mat{\Omega}_j$, which substantially reduces the number of parameters to estimate in $\mat{\Omega}$. The assumption that the $\mat{\Omega}_j$'s be positive definite is possible due to the constant $c$.





Equation \eqref{eq:eta2} is underdetermined since $\t{(\mat{T}_2\pinv{\mat{D}_p})}$ has full column rank $d < p^2$ (with a nonstrict inequality if $\ten{X}$ is univariate) but $\mat{\eta}_2$ is uniquely determined given any $\mat{\Omega}$ as $\t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})})}$ has full row rank. We let $\mat{\xi} = (\vec{\overline{\ten{\eta}}}, \vec{\mat{B}}, \vech{\mat{\Omega}})$ be a $p(p + 2 q + 3) / 2$parameter vector and $\mat{\theta} = (\vec{\overline{\ten{\eta}}}, \vec{\mat{B}}, \vech{\mat{\Omega}})$ be the constrained parameter vector, where $\mat{B}=\bigotimes_{j = r}^{1}\mat{\beta}_j$ and $\mat{\Omega} = \bigotimes_{j = r}^{1}\mat{\Omega}_j$. We also let $\Xi$ and $\Theta$ denote the unconstrained and constrained parameter spaces, with $\mat{\xi}$ and $\mat{\theta}$ varying in $\Xi$ and $\Theta$, respectively. The parameter space $\Xi$ is an open subset of $\mathbb{R}^{p(p + 2 q + 3) / 2}$ so that \eqref{eq:quadraticexpfam} is a proper density. Later, we relax the assumptions for $\mat{\beta}_k$ and $\mat{\Omega}_k$ as a consequence of \cref{thm:parammanifold} in \cref{sec:kronmanifolds}.










In a classical \emph{generalized linear model} (GLM), the link function connecting the natural parameters to the expectation of the sufficient statistic $\mat{\eta}_y = \mat{g}(\E[\mat{t}(\ten{X}) \mid Y = y])$ is invertible. Such a link may not exist in our setting, but for our purpose what we call the ``inverse'' link suffices. The ``inverse'' link $\widetilde{\mat{g}}$ exists as the natural parameters fully describe the distribution. As in the nonminimal formulation \eqref{eq:quadraticexpfam}, we define the ``inverse'' link through its tensor valued components





\begin{align}




@ 495,7 +490,7 @@ as $\widetilde{\mat{g}}(\mat{\eta}_y) = (\vec\ten{g}_1(\mat{\eta}_y), \vec\ten{g










Under the quadratic exponential family model \eqref{eq:quadraticexpfam}, a sufficient reduction for the regression of $Y$ on $\ten{X}$ is given in \cref{thm:sdr}.





\begin{theorem}[SDR]\label{thm:sdr}





A sufficient reduction for the regression $Y\mid \ten{X}$ under the quadratic exponential family inverse regression model \eqref{eq:quadraticexpfam} with natural parameters modeled \eqref{eq:eta1} and \eqref{eq:eta2} is given by





A sufficient reduction for the regression $Y\mid \ten{X}$ under the quadratic exponential family inverse regression model \eqref{eq:quadraticexpfam} with natural parameters \eqref{eq:eta1} and \eqref{eq:eta2} is given by





\begin{align}\label{eq:sdr}





\ten{R}(\ten{X})





% &= (\ten{X}  \E\ten{X})\times\{ \t{\mat{\beta}_1}, \ldots, \t{\mat{\beta}_r} \}.




@ 551,10 +546,10 @@ The maximum likelihood estimate of $\mat{\theta}_0$ is the solution to the optim





\end{equation}





with $\hat{\mat{\theta}}_n = (\vec\widehat{\overline{\ten{\eta}}}, \vec\widehat{\mat{B}}, \vech\widetilde{\mat{\Omega}})$ where $\widehat{\mat{B}} = \bigkron_{k = r}^{1}\widehat{\mat{\beta}}_k$ and $\widehat{\mat{\Omega}} = \bigkron_{k = r}^{1}\widehat{\mat{\Omega}}_k$.










A straightforward and general method for parameter estimation is \emph{gradient descent}. For applying gradient based optimization, we compute the gradients of $l_n$ in \cref{thm:grad}.





A straightforward and general method for parameter estimation is \emph{gradient descent}. To apply gradient based optimization, we compute the gradients of $l_n$ in \cref{thm:grad}.










\begin{theorem}\label{thm:grad}





For $n$ i.i.d. observations $(\ten{X}_i, y_i), i = 1, ..., n$ the loglikelihood has the form \eqref{eq:loglikelihood} with $\mat{\theta}$ being the collection of all GMLM parameters $\overline{\ten{\eta}}$, ${\mat{B}} = \bigkron_{k = r}^{1}{\mat{\beta}}_k$ and ${\mat{\Omega}} = \bigkron_{k = r}^{1}{\mat{\Omega}}_k$ for $k = 1, ..., r$. Furthermore, let $\ten{G}_2(\mat{\eta}_y)$ be a tensor of dimensions $p_1, \ldots, p_r$ such that





For $n$ i.i.d. observations $(\ten{X}_i, y_i), i = 1, ..., n$ the loglikelihood is of the form in \eqref{eq:loglikelihood} with $\mat{\theta}$ being the collection of all GMLM parameters $\overline{\ten{\eta}}$, ${\mat{B}} = \bigkron_{k = r}^{1}{\mat{\beta}}_k$ and ${\mat{\Omega}} = \bigkron_{k = r}^{1}{\mat{\Omega}}_k$ for $k = 1, ..., r$. Let $\ten{G}_2(\mat{\eta}_y)$ be a tensor of dimensions $p_1, \ldots, p_r$ such that





\begin{displaymath}





\vec{\ten{G}_2(\mat{\eta}_y)} = \pinv{(\mat{T}_2\pinv{\mat{D}_p})}\mat{T}_2\pinv{\mat{D}_p}\vec{\ten{g}_2(\mat{\eta}_y)}.





\end{displaymath}




@ 564,49 +559,47 @@ A straightforward and general method for parameter estimation is \emph{gradient





\nabla_{\mat{\beta}_j}l_n &= \vec\frac{1}{n}\sum_{i = 1}^n (\ten{X}_i  \ten{g}_1(\mat{\eta}_{y_i}))_{(j)}\t{\Big(\ten{F}_{y_i}\mlm_{k\in[r]\backslash j}\mat{\beta}_k\Big)_{(j)}}, \\





\nabla_{\mat{\Omega}_j}l_n &= \vec\frac{c}{n}\sum_{i = 1}^n (\ten{X}_i\otimes\ten{X}_i  \K(\ten{G}_2(\mat{\eta}_{y_i})))\mlm_{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})}





\end{align*}





giving the gradient $\nabla l_n = (\nabla_{\overline{\ten{\eta}}}l_n, \nabla_{\mat{\beta}_1}l_n, \ldots, \nabla_{\mat{\beta}_r}l_n, \nabla_{\mat{\Omega}_1}l_n, \ldots, \nabla_{\mat{\Omega}_r}l_n)$.










If $\mat{T}_2$ is the identity matrix $\mat{I}_{p(p + 1) / 2}$, then $\ten{G}_2(\mat{\eta}_y) = \ten{g}_2(\mat{\eta}_y)$.





which obtains $\nabla l_n = (\nabla_{\overline{\ten{\eta}}}l_n, \nabla_{\mat{\beta}_1}l_n, \ldots, \nabla_{\mat{\beta}_r}l_n, \nabla_{\mat{\Omega}_1}l_n, \ldots, \nabla_{\mat{\Omega}_r}l_n)$.





If $\mat{T}_2$ is the identity matrix $\mat{I}_{p(p + 1) / 2}$, then $\ten{G}_2(\mat{\eta}_y) = \ten{g}_2(\mat{\eta}_y)$.





\end{theorem}










Although the general case of any GMLM model can be fitted via gradient descent using \cref{thm:grad}, this may be very inefficient. In case of the tensor normal distribution an iterative cyclic updating scheme can be derived by setting the partial gradients of \cref{thm:grad} to zero. Described in more details in \cref{sec:tensor_normal_estimation}, this method has much faster convergence, is stabel and does not requires any hyper parameters. On the other hand, the Ising model does not allow such a scheme. There we need to use a gradient based method, which is the subject of \cref{sec:ising_estimation}.





Although the general case of any GMLM model can be fitted via gradient descent using \cref{thm:grad}, this may be very inefficient. In \cref{thm:grad}, $\mat{T}_2$ can be used to introduce flexible second moment structures. For example, it allows modeling effects differently for predictor components, as described in \cref{sec:ising_estimation} after Eqn. \eqref{eq:isingcondprob}. In the remainder, we focus on $\mat{T}_2$'s that are identity matrices. This approach simplifies the estimation algorithm and the speed of the numerical calculation in the case of tensor normal predictors. In the case of the tensor normal distribution, an iterative cyclic updating scheme is derived in \cref{sec:tensor_normal_estimation}, which has much faster convergence, is stable and does not require hyper parameters, as will be discussed later. On the other hand, the Ising model does not allow such a scheme. There we need to use a gradient based method, which is the subject of \cref{sec:ising_estimation}.










%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%





\subsection{Tensor Normal}\label{sec:tensor_normal_estimation}





%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%





With $\ten{X}\mid Y = y$ following a tensor normal distribution, its density, with mean $\ten{\mu}_y$ and covariance $\mat{\Sigma} = \bigkron_{k = r}^{1}\mat{\Sigma}_k$ is given by





Suppose $\ten{X}\mid Y = y$ follows a tensor normal distribution with mean $\ten{\mu}_y$ and covariance $\mat{\Sigma} = \bigkron_{k = r}^{1}\mat{\Sigma}_k$. We assume the distribution is nondegenerate which means that the covariances $\mat{\Sigma}_k$ are symmetric positive definite matrices. Its density is given by





\begin{displaymath}





f_{\mat{\theta}}(\ten{X}\mid Y = y) = (2\pi)^{p / 2}\prod_{k = 1}^{r}\det(\mat{\Sigma}_k)^{p / 2 p_k}\exp\left( \frac{1}{2}\left\langle\ten{X}  \ten{\mu}_y, (\ten{X}  \ten{\mu}_y)\mlm_{k = 1}^{r}\mat{\Sigma}_k^{1} \right\rangle \right).





\end{displaymath}





We assume the distribution is nondegenerate which means that the covariances $\mat{\Sigma}_k$ are symmetric positive definite matrices. For the sake of simplicity, we w.l.o.g. assume $\ten{X}$ to have marginal expectation equal zero, $\E\ten{X} = 0$. Rewriting this in the quadratic exponential family form \eqref{eq:quadraticexpfam}, determines tha scaling constant $c = 1/2$, and give the relation to the GMLM parameters $\overline{\ten{\eta}}, \mat{\beta}_k$ and $\mat{\Omega}_k$, for $k = 1, \ldots, r$ as





For the sake of simplicity and w.l.o.g., we assume $\ten{X}$ has 0 marginal expectation; i.e., $\E\ten{X} = 0$. Rewriting this in the quadratic exponential family form \eqref{eq:quadraticexpfam}, determines the scaling constant $c = 1/2$. The relation to the GMLM parameters $\overline{\ten{\eta}}, \mat{\beta}_k$ and $\mat{\Omega}_k$, for $k = 1, \ldots, r$ is





\begin{displaymath}





\ten{\mu}_y = \ten{F}_y\mlm_{k = 1}^{r}\mat{\Omega}_k^{1}\mat{\beta}_k, \qquad





\mat{\Omega}_k = \mat{\Sigma}_k^{1}





\mat{\Omega}_k = \mat{\Sigma}_k^{1},





\end{displaymath}





where we used that $\overline{\ten{\eta}} = 0$ due to $0 = \E\ten{X} = \E\E[\ten{X}\mid Y] = \E\ten{\mu}_Y$ in combination with $\E\ten{F}_Y = 0$. Additionally, all the $\mat{\Omega}_k$'s are symmetric positive definite, because the $\mat{\Sigma}_k$'s are. This lead to another simplification the matrix $\mat{T}_2$ from \eqref{eq:tstat} is equal to the identity. This also means that the gradients of the loglikelihood $l_n$ in \cref{thm:grad} are simpler as well. We get





where we used that $\overline{\ten{\eta}} = 0$ due to $0 = \E\ten{X} = \E\E[\ten{X}\mid Y] = \E\ten{\mu}_Y$ in combination with $\E\ten{F}_Y = 0$. Additionally, all the $\mat{\Omega}_k$'s are symmetric positive definite, because the $\mat{\Sigma}_k$'s are. This lead to another simplification since then $\mat{T}_2$ in \eqref{eq:tstat} equals the identity. This also means that the gradients of the loglikelihood $l_n$ in \cref{thm:grad} are simpler. We obtain





\begin{displaymath}





\ten{g}_1(\mat{\eta}_y) = \E[\ten{X}\mid Y = y] = \ten{\mu}_y, \qquad





\ten{G}_2(\mat{\eta}_y) = \ten{g}_2(\mat{\eta}_y) = \E[\ten{X}\circ\ten{X}\mid Y = y] \equiv \bigkron_{k = r}^1\mat{\Sigma}_k + (\vec{\ten{\mu}}_y)\t{(\vec{\ten{\mu}}_y)}.





\end{displaymath}










In practice, we start the estimation process by demeaning a given data set $(\ten{X}_i, \ten{F}_{y_i})$ with $i = 1, \ldots, n$ observations. After demenaing, only the reduction matrices $\mat{\beta}_k$ and the scatter matrices $\mat{\Omega}_k$ need to be estimated. Then, to solve the optimization problem \eqref{eq:mle}, with $\overline{\ten{\eta}} = 0$ after demeaning, we need some initial values. For initial estimates $\hat{\mat{\beta}}_k^{(0)}$ we use a simple heuristic approach. First, we compute moment based modewise marginal covariance estimates $\widehat{\mat{\Sigma}}_k(\ten{X})$ and $\widehat{\mat{\Sigma}}_k(\ten{F}_Y)$ given by





In practice, we assume we have a random sample of $n$ observations $(\ten{X}_i, \ten{F}_{y_i})$ from the joint distribution. We start the estimation process by demeaning them. Then, only the reduction matrices $\mat{\beta}_k$ and the scatter matrices $\mat{\Omega}_k$ need to be estimated. To solve the optimization problem \eqref{eq:mle}, with $\overline{\ten{\eta}} = 0$ we initialize the parameters using a simple heuristic approach. First, we compute moment based modewise marginal covariance estimates $\widehat{\mat{\Sigma}}_k(\ten{X})$ and $\widehat{\mat{\Sigma}}_k(\ten{F}_Y)$ as





\begin{displaymath}





\widehat{\mat{\Sigma}}_k(\ten{X}) = \frac{1}{n}\sum_{i = 1}^{n} (\ten{X}_i)_{(k)}\t{(\ten{X}_i)_{(k)}}, \qquad





\widehat{\mat{\Sigma}}_k(\ten{X}) = \frac{1}{n}\sum_{i = 1}^{n} (\ten{X}_i)_{(k)}\t{(\ten{X}_i)_{(k)}}, \quad





\widehat{\mat{\Sigma}}_k(\ten{F}_Y) = \frac{1}{n}\sum_{i = 1}^{n} (\ten{F}_{y_i})_{(k)}\t{(\ten{F}_{y_i})_{(k)}}.





\end{displaymath}





Then, for every mode $k = 1, \ldots, r$, we compute the first $j = 1, \ldots, q_k$ eigen vectors $\mat{v}_j(\widehat{\mat{\Sigma}}_k(\ten{X}))$, $\mat{v}_j(\widehat{\mat{\Sigma}}_k(\ten{F}_Y))$ and eigen values $\lambda_j(\widehat{\mat{\Sigma}}_k(\ten{X}))$, $\lambda_j(\widehat{\mat{\Sigma}}_k(\ten{X}))$ of the marginal covariance estimates. Let,





Then, for every mode $k = 1, \ldots, r$, we compute the first $j = 1, \ldots, q_k$ eigenvectors $\mat{v}_j(\widehat{\mat{\Sigma}}_k(\ten{X}))$, $\mat{v}_j(\widehat{\mat{\Sigma}}_k(\ten{F}_Y))$ and eigenvalues $\lambda_j(\widehat{\mat{\Sigma}}_k(\ten{X}))$, $\lambda_j(\widehat{\mat{\Sigma}}_k(\ten{X}))$ of the marginal covariance estimates. We set





\begin{align*}





\mat{U}_k &= (\mat{v}_1(\widehat{\mat{\Sigma}}_1(\ten{X})), \ldots, \mat{v}_{q_k}(\widehat{\mat{\Sigma}}_{q_k}(\ten{X}))), \\





\mat{D}_k &= \diag(\mat{v}_1(\widehat{\mat{\Sigma}}_1(\ten{X}))\mat{v}_1(\widehat{\mat{\Sigma}}_1(\ten{F}_{Y})), \ldots, \mat{v}_{q_k}(\widehat{\mat{\Sigma}}_{q_k}(\ten{X}))\mat{v}_{q_k}(\widehat{\mat{\Sigma}}_k(\ten{F}_{Y}))), \\





\mat{V}_k &= (\mat{v}_1(\widehat{\mat{\Sigma}}_1(\ten{F}_Y), \ldots, \mat{v}_{q_k}(\widehat{\mat{\Sigma}}_{q_k}(\ten{F}_Y)). \\





\end{align*}





The initial estimates are then set to





The initial value of $\mat{\beta}_k$ is





\begin{displaymath}





\hat{\mat{\beta}}_k^{(0)} = \mat{U}_k\sqrt{\mat{D}_k}\t{\mat{V}_k}.





\hat{\mat{\beta}}_k^{(0)} = \mat{U}_k\sqrt{\mat{D}_k}\t{\mat{V}_k},





\end{displaymath}





The initial values of $\mat{\Omega}_k$ are set to the identity $\mat{\Omega}_k^{(0)} = \mat{I}_{p_k}$.





and the initial value of $\mat{\Omega}_k$ is set to the identity $\mat{\Omega}_k^{(0)} = \mat{I}_{p_k}$, for $k=1,\ldots,r$.










Given any estimates $\hat{\mat{\beta}}_1, \ldots, \hat{\mat{\beta}}_r, \hat{\mat{\Omega}}_1, \ldots, \hat{\mat{\Omega}}_r$, we take the gradient $\nabla_{\mat{\beta}_j}$ of the loglikelihood $l_n$ in \eqref{eq:loglikelihood} of the tensor normal distributed from \cref{thm:grad} and keep all other parameters except $\mat{\beta}_j$ fixed. Then, $\nabla_{\mat{\beta}_j}l_n = 0$ has a closed form solution





Given $\hat{\mat{\beta}}_1, \ldots, \hat{\mat{\beta}}_r, \hat{\mat{\Omega}}_1, \ldots, \hat{\mat{\Omega}}_r$, we take the gradient $\nabla_{\mat{\beta}_j}l_n$ of the tensor normal loglikelihood $l_n$ in \eqref{eq:loglikelihood} applying \cref{thm:grad} and keep all other parameters except $\mat{\beta}_j$ fixed. Then, $\nabla_{\mat{\beta}_j}l_n = 0$ has the closed form solution





\begin{equation}\label{eq:tensor_normal_beta_solution}





\t{\mat{\beta}_j} = \biggl(





\sum_{i = 1}^{n}




@ 620,43 +613,133 @@ Given any estimates $\hat{\mat{\beta}}_1, \ldots, \hat{\mat{\beta}}_r, \hat{\mat





\biggr)





\hat{\mat{\Omega}}_j.





\end{equation}





For the scatter matrices $\mat{\Omega}_j$, we need to fudge a bit. Equating the partial gradient of the $j$'th scatter matrix $\mat{\Omega}_j$ in \cref{thm:grad} to zero, $\nabla_{\mat{\Omega}_j}l_n = 0$, gives a quadratic matrix equation. This is due to the dependence of $\ten{\mu}_y$ on $\mat{\Omega}_j$. In practive thoug, it is more stable, faster and equaly accurate to use modewise covariance estimates via the residuals





Equating the partial gradient of the $j$th scatter matrix $\mat{\Omega}_j$ in \cref{thm:grad} to zero ( $\nabla_{\mat{\Omega}_j}l_n = 0$) gives a quadratic matrix equation. This is due to the dependence of $\ten{\mu}_y$ on $\mat{\Omega}_j$. In practice though, it is faster, more stable, and equally accurate to use modewise covariance estimates via the residuals





\begin{displaymath}





\hat{\ten{R}}_i = \ten{X}_i  \hat{\ten{\mu}}_{y_i} = \ten{X}_i  \ten{F}_{y_i}\mlm_{k = 1}^{r}\hat{\mat{\Omega}}_k^{1}\hat{\mat{\beta}}_k.





\end{displaymath}





The estimates are computed via the inbetween values





\begin{displaymath}





\tilde{\mat{\Sigma}}_j = \sum_{i = 1}^n (\hat{\ten{R}}_i)_{(j)} \t{(\hat{\ten{R}}_i)_{(j)}}





\end{displaymath}





which only lacks scaling $\tilde{s}\tilde{\mat{\Sigma}}_j = \hat{\mat{\Omega}}_j^{1}$. The scaling condition is that the mean squared error has to be equal to the trace of the covariance estimate,





The estimates are computed via





\begin{displaymath}





\frac{1}{n}\sum_{i = 1}^n \langle \hat{\ten{R}}_i, \hat{\ten{R}}_i \rangle = \tr\bigkron_{k = r}^{1}\hat{\mat{\Omega}}_k^{1} = \prod_{k = 1}^{r}\tr{\hat{\mat{\Omega}}_k^{1}} = \tilde{s}^r\prod_{k = 1}^{r}\tr{\tilde{\mat{\Sigma}}_k}.





\tilde{\mat{\Sigma}}_j = \sum_{i = 1}^n (\hat{\ten{R}}_i)_{(j)} \t{(\hat{\ten{R}}_i)_{(j)}},





\end{displaymath}





Solving for the scaling value gives





where $\tilde{s}\tilde{\mat{\Sigma}}_j = \hat{\mat{\Omega}}_j^{1}$. For scaling we use that the mean squared error has to be equal to the trace of the covariance estimate,





\begin{displaymath}





\frac{1}{n}\sum_{i = 1}^n \langle \hat{\ten{R}}_i, \hat{\ten{R}}_i \rangle = \tr\bigkron_{k = r}^{1}\hat{\mat{\Omega}}_k^{1} = \prod_{k = 1}^{r}\tr{\hat{\mat{\Omega}}_k^{1}} = \tilde{s}^r\prod_{k = 1}^{r}\tr{\tilde{\mat{\Sigma}}_k},





\end{displaymath}





so that





\begin{displaymath}





\tilde{s} = \biggl(\Bigl(\prod_{k = 1}^{r}\tr{\tilde{\mat{\Sigma}}_k}\Bigr)^{1}\frac{1}{n}\sum_{i = 1}^n \langle \hat{\ten{R}}_i, \hat{\ten{R}}_i \rangle\biggr)^{1 / r}





\end{displaymath}





resulting in the estimates $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j)^{1}$.





Estimation is then performed by updating the estimates $\hat{\mat{\beta}}_j$ via \eqref{eq:tensor_normal_beta_solution} for $j = 1, \ldots, r$, and then recompute the $\hat{\mat{\Omega}}_j$ estimates simultaneously keeping the $\hat{\mat{\beta}}_j$'s fixed. This procedure is repeated until convergence. % Convergence is very fast, experiments showed that convergence occures usualy in less than $10$ iterations.










Estimation is then performed by updating for $j = 1, \ldots, r$ the estimates $\hat{\mat{\beta}}_j$ via \eqref{eq:tensor_normal_beta_solution}, then we recompute the $\hat{\mat{\Omega}}_j$ estimates simultaniously keeping the $\hat{\mat{\beta}}_j$'s fixed. This procedure is repeated untill convergence. Convergence is very fast, experiments showed that convergence occures usualy in less than $10$ iterations.





A technical detail for numerical stability is to ensure that the scaled values $\tilde{s}\tilde{\mat{\Sigma}}_j$, assumed to be symmetric and positive definite, are well conditioned. Thus, we estimate the condition number of $\tilde{s}\tilde{\mat{\Sigma}}_j$ prior to computing the inverse. In case of illconditioning, we use the regularized $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j + 0.2 \lambda_{1}(\tilde{s}\tilde{\mat{\Sigma}}_j)\mat{I}_{p_j})^{1}$ instead, where $\lambda_{1}(\tilde{s}\tilde{\mat{\Sigma}}_j)$ is the first (maximum) eigenvalue. Experiments showed that this regularization is usually only required in the first few iterations.










A technical detail for numerical stability is to ensure that the scaled inbetween values $\tilde{s}\tilde{\mat{\Sigma}}_j$, assumed to be symmetric and positive definite, are well conditioned. Otherwise, the procedure can be numerically unstable by attempting to compute $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j)^{1}$. Therefore, we estimate the condition number of $\tilde{s}\tilde{\mat{\Sigma}}_j$ before computing the inverse. In case of being ill conditioned, we regularize and use $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j + 0.2 \lambda_{1}(\tilde{s}\tilde{\mat{\Sigma}}_j)\mat{I}_{p_j})^{1}$ instead, where $\lambda_{1}(\tilde{s}\tilde{\mat{\Sigma}}_j)$ is the first (maximum) eigen value. Experiments showed that this regularization is usually required in the first few iterations only.










Furthermore, we may assume that the parameter space follows a more generel setting as in \cref{thm:parammanifold}. In this case, updating may produces estimates outside of the parameter space. A simple and efficient method is to project every updated estimate onto the corresponding manifold.





Furthermore, if the parameter space follows a more general setting as in \cref{thm:parammanifold}, updating may produces estimates outside the parameter space. A simple and efficient method is to project every updated estimate onto the corresponding manifold.










%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%





\subsection{Ising Model}\label{sec:ising_estimation}





%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%





The general distribution of a binary vector is modeled by the \emph{multivariate Bernoulli distribution} (see: \textcite{GraphicalModelsWhittaker2009, MVBDai2012, MVBDaiDingWahba2013}). The \emph{Ising model} \textcite{IsingIsing1924} is a special case, considering only twoway interactions. Its probability mass function (PMF) for a binary random vector $X\in\{ 0, 1 \}^p$ with natural parameters $\mat{\gamma}\in\mathbb{R}^{p(p + 1) / 2}$ is given by





\begin{displaymath}





P_{\mat{\gamma}}(\mat{x}) = p_0(\mat{\gamma})\exp(\t{\vech(\mat{x}\t{\mat{x}})}\mat{\gamma}).





\end{displaymath}





The scaling factor $p_0(\mat{\gamma})\in\mathbb{R}_{+}$ ensures that $P_{\mat{\gamma}}$ is a PMF. It is equal to the probability of the zero event $P(X = \mat{0}) = p_0(\mat{\gamma})$. More commonly known as the \emph{partition function}, the reciprocal of $p_0$, given by





\begin{equation}\label{eq:isingpartitionfunction}





p_0(\mat{\gamma})^{1} = \sum_{\mat{x}\in\{0, 1\}^p}\exp(\t{\vech(\mat{x}\t{\mat{x}})}\mat{\gamma}).





\end{equation}





By an abuse of notation, let $\mat{\gamma}_{j l}$ address the element of $\mat{\gamma}$ corresponding to $\mat{x}_j\mat{x}_l$ in $\vech(\mat{x}\t{\mat{x}})$\footnote{Specifically, the element $\mat{\gamma}_{j l}$ of $\mat{\gamma}$ is a short hand for $\mat{\gamma}_{\iota(j, l)}$ with $\iota(j, l) = (\min(j, l)  1)(2 p  \min(j, l)) / 2 + \max(j, l)$ mapping the matrix row index $j$ and column index $l$ to the corresponding half vectorization indices $\iota(j, l)$.}. The ``diagonal'' parameter $\mat{\gamma}_{j j}$ expresses the conditional log odds of $X_j = 1\mid X_{j} = \mat{0}$, where the negative subscript in $X_{j}$ describes the $p  1$ dimensional vector $X$ with the $j$'t element removed. The ``off diagonal'' parameters $\mat{\gamma}_{j l}$, for $j\neq l$, are equal to the conditional log odds of simultanious occurence $X_j = 1, X_l = 1 \mid X_{j, l} = \mat{0}$. More precise, for $j\neq l$, the conditional probabitities and the natural parameters are related by





\begin{align}





\mat{\gamma}_{j j} &= \log\frac{P_{\mat{\gamma}}(X_j = 1\mid X_{j} = \mat{0})}{1  P_{\mat{\gamma}}(X_j = 1\mid X_{j} = \mat{0})}, \nonumber \\





\mat{\gamma}_{j l} &= \log\frac{1  P_{\mat{\gamma}}(X_j = 1\mid X_{j} = \mat{0})P_{\mat{\gamma}}(X_l = 1\mid X_{l} = \mat{0})}{P_{\mat{\gamma}}(X_j = 1\mid X_{j} = \mat{0})P_{\mat{\gamma}}(X_l = 1\mid X_{l} = \mat{0})}\frac{P_{\mat{\gamma}}(X_j = 1, X_l = 1\mid X_{j, l} = \mat{0})}{1  P_{\mat{\gamma}}(X_j = 1, X_l = 1\mid X_{j, l} = \mat{0})} \label{eq:isingtwowaylogodds}.





\end{align}










Conditional Ising models, incorporating the information of covariates $Y$ into the model, have also been considered \textcite{sparseIsingChengEtAt2014, sdrmixedPredictorsBuraForzaniEtAl2022}. The direct way is to parameterize the parameter $\mat{\gamma} = \mat{\gamma}_y$ by the covariate $Y = y$ to model a conditional distribution $P_{\mat{\gamma}_y}(\mat{x}\mid Y = y)$.










We extend the conditional PMF by allowing the binary variables to be tensor values, that is for $\ten{X}\in\{ 0, 1 \}^{p_1\times\cdots\times p_r}$ we set $\mat{x} = \vec{\ten{X}}$, with dimension $p = \prod_{k = 1}^{r}p_k$. Considering the tensor structure of $\ten{X}$ is done by assuming Kronecker product constraints to the parameter vector $\mat{\gamma}_y$ in a similar fashion as for the tensor normal model. This means that we compair the PMF $P_{\mat{\gamma}_y}(\vec{\ten{X}}  Y = y)$ to the quadratic exponential family \eqref{eq:quadraticexpfam} with the natural parameters modeled by \eqref{eq:eta1} and \eqref{eq:eta2}. A detail to be considered is that the diagonal of $(\vec{\ten{X}})\t{(\vec{\ten{X}})}$ is equal to $\vec{\ten{X}}$. This gives the GMLM model as





\begin{align}





P_{\mat{\gamma}_y}(\ten{X} \mid Y = y)





&= p_0(\mat{\gamma}_y)\exp(\t{\vech((\vec{\ten{X}})\t{(\vec{\ten{X}})})}\mat{\gamma}_y) \nonumber \\





&= p_0(\mat{\gamma}_y)\exp\Bigl(\Bigl\langle \ten{X}, \ten{F}_y\mlm_{k = 1}^{r}\mat{\beta}_k \Bigr\rangle + \Bigl\langle\ten{X}\mlm_{k = 1}^{r}\mat{\Omega}_k, \ten{X}\Bigr\rangle\Bigr)\label{eq:isingcondprob}





\end{align}





where we set $\overline{\ten{\eta}} = 0$ and $\mat{T}_2$ to the identity. This is an additional constraint to the model, the reason is that the diagonal elements of $\mat{\Omega} = \bigkron_{k = r}^{1}\mat{\Omega}_k$ take the role of $\overline{\ten{\eta}}$, althoug not fully. Having the diagonal of $\mat{\Omega}$ and $\overline{\ten{\eta}}$ handling the self interaction effects might lead to interference in the optimization routine. Another approach would be to use the $\mat{T}_2$ matrix to set the corresponding diagonal elements of $\mat{\Omega}$ to zero and let $\overline{\ten{\eta}}$ handle the self interaction effect. All of those approaches, namely setting $\overline{\ten{\eta}} = 0$, keeping $\overline{\ten{\eta}}$ or using $\mat{T}_2$, are theoretically solid and compatible with \cref{thm:grad,thm:parammanifold,thm:asymptoticnormalitygmlm}, assuming all axis dimensions $p_k$ are nondegenerate, that is $p_k > 1$ for all $k = 1, \ldots, r$. Regardles, under our modeling choise we get the relation between the natural parameters $\mat{\gamma}_y$ of the conditional Ising model and the GMLM parameters $\mat{\beta}_k$ and $\mat{\Omega}_k$ as





\begin{equation}\label{eq:isingnaturalparams}





% \t{\pinv{\mat{D}_p}}\mat{\gamma}_y





% = \vec(\mat{\Omega} + \diag(\mat{B}\vec{\ten{F}_y}))





% = \vec\Biggl(\bigkron_{k = r}^{1}\mat{\Omega}_k + \diag\biggl(\vec\Bigl(\ten{F}_y\mlm_{k = 1}^{r}\mat{\beta}_k\Bigr)\biggr)\Biggr).





\mat{\gamma}_y





= \t{\mat{D}_p}\vec(\mat{\Omega} + \diag(\mat{B}\vec{\ten{F}_y}))





= \t{\mat{D}_p}\vec\Biggl(\bigkron_{k = r}^{1}\mat{\Omega}_k + \diag\biggl(\vec\Bigl(\ten{F}_y\mlm_{k = 1}^{r}\mat{\beta}_k\Bigr)\biggr)\Biggr).





\end{equation}










In contract to the tensor normal GMLM, the matrices $\mat{\Omega}_k$ are only required to be symmetric. More specificaly, we require $\mat{\Omega}_k$, for $k = 1, \ldots, r$, to be elements of an embedded submanifold of $\SymMat{p_k}$ (see: \cref{sec:kronmanifolds,sec:matrixmanifolds}). The mode wise reduction matrices $\mat{\beta}_k$ need to be elements of an embedded submanifold of $\mathbb{R}^{p_k\times q_k}$. Common choises are listed in \cref{sec:matrixmanifolds}. \todo{check if we need to exclude zero here!}










To solve the optimization problem \eqref{eq:mle}, given a data set $(\ten{X}_i, y_i)$, for $i = 1, \ldots, n$, we use a variation of gradient descent.










%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%





\subsubsection{Initial Values}










The first step is to get reasonable starting values. Experiments showed that a good starting value of $\mat{\beta}_k$, for $k = 1, \ldots, r$, it to use the tensor normal estimates from \cref{sec:tensor_normal_estimation}, interprating $\ten{X}_i$ as continuous. For initial values of $\mat{\Omega}_k$, a different approach is required. Setting everything to the uninformed initial value, that is $\mat{\Omega}_k = \mat{0}$ as this corresponds to the conditional log odds to be $1:1$ for every component and pairwaide interaction. This is not possible, since $\mat{0}$ is a stationary point of the loglikelihood. This is directly observed by taking a look at the partial gradients of the loglikelihood in \cref{thm:grad}. Instead, we use a crude heuristic which threads every mode seperately and ignores any relation to the covariates. It is computationaly cheap and better than any of the alternatives we considered. For every $k = 1, \ldots, r$, let the $k$'th mode second moment estimate be





\begin{equation}\label{eq:isingmodemoments}





\hat{\mat{M}}_{2(k)} = \frac{p_k}{n p}\sum_{i = 1}^n (\ten{X}_i)_{(k)}\t{(\ten{X}_i)_{(k)}}





\end{equation}





which contains the $k$'th mode first moment estimate in its diagonal $\hat{\mat{M}}_{1(k)} = \diag\hat{\mat{M}}_{2(k)}$. Considering every column of the matricized observation $(\ten{X}_i)_{(k)}$ as a $p_k$ dimensional observation itself. The number of those artifically generated observations is $n \prod_{j\neq k}p_j$. Let $Z_k$ denote the random variable those artifical observations are realization of. Then, we can interprate the elements $(\hat{\mat{M}}_{1(k)})_{j}$ as the estimates of the probability $P((Z_k)_j = 1)$, that is the marginal probability of the $j$th element of $Z_k$ being $1$. Similar, for $l \neq j$ we have $(\hat{\mat{M}}_{2(k)})_{j l}$ estimating $P((Z_k)_j = 1, (Z_k)_l = 1)$, the marginal probability of twoway interactions. % Without any regard of accuracy ...





Now, we set the diagonal elements of $\mat{\Omega}_k$ to zero. For the off diagonal elements of $\mat{\Omega}_k$, we equate the conditional probabilities $P((Z_k)_j = 1 \mid (Z_k)_{j} = \mat{0})$ and $P((Z_k)_j = 1, (Z_k)_l = 1\mid (Z_k)_{j, l} = \mat{0})$ with the marginal probability estimates $(\hat{\mat{M}}_{1(k)})_{j}$ and $(\hat{\mat{M}}_{2(k)})_{j l}$, respectively. Use \eqref{eq:isingtwowaylogodds} then gives the initial estimates $\hat{\mat{\Omega}}_k^{(0)}$, with $j \neq l$ component wise





\begin{equation}\label{eq:isinginitOmegas}





(\hat{\mat{\Omega}}_k^{(0)})_{j j} = 0,





\qquad





(\hat{\mat{\Omega}}_k^{(0)})_{j l} = \log\frac{1  (\hat{\mat{M}}_{1(k)})_{j}(\hat{\mat{M}}_{1(k)})_{l}}{(\hat{\mat{M}}_{1(k)})_{j}(\hat{\mat{M}}_{1(k)})_{l}}\frac{(\hat{\mat{M}}_{2(k)})_{j l}}{1  (\hat{\mat{M}}_{2(k)})_{j l}}.





\end{equation}










%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%





\subsubsection{Gradient Optimization}





Given initial values, the gradients provided by \cref{thm:grad} can be evaluated for the Ising model. The first step therefore is to determin the values of the inverse link components $\ten{g}_1(\mat{\gamma}_y) = \E[\ten{X} \mid Y = y]$ and $\ten{G}_2(\mat{\gamma}_y) = \ten{g}_2(\mat{\gamma}_y) = \E[\ten{X}\circ\ten{X} \mid Y = y]$. An imediate simplification is that the first moment is a part of the second moment. Its values are determined via $\vec(\E[\ten{X} \mid Y = y]) = \diag(\E[\ten{X}\circ\ten{X} \mid Y = y]_{(1, \ldots, r)})$. This means only the second moment needs to be computed, or estimated (see: \cref{sec:isingbiggerdim}) in the case of slightly bigger $p$. For the Ising model, the conditional second moment with parameters $\mat{\gamma}_y$ is given by the matricized relation





\begin{equation}\label{eq:isingm2}





\ten{g}_2(\ten{\gamma}_y)_{(1, \ldots, r)} = \E[(\vec{\ten{X}})\t{(\vec{\ten{X}})}\mid Y = y] = p_0(\mat{\gamma}_y)\sum_{\mat{x}\in\{0, 1\}^{p}}\mat{x}\t{\mat{x}}\exp(\t{\vech(\mat{x}\t{\mat{x}})}\mat{\gamma}_y).





\end{equation}





The natural parameter $\mat{\gamma}_y$ is evaluated via \eqref{eq:isingnaturalparams} enabeling us to compute the partial gradients of the loglikelihood $l_n$ \eqref{eq:loglikelihood} for the Ising model by \cref{thm:grad} for the GMLM parameters $\mat{\beta}_k$ and $\mat{\Omega}_k$, $k = 1, \ldots, r$, at the current iterate $\mat{\theta}^{(I)} = (\mat{\beta}_1^{(I)}, \ldots, \mat{\beta}_r^{(I)}, \mat{\Omega}_1^{(I)}, \ldots, \mat{\Omega}_r^{(I)})$. Using classic gradient ascent for maximizing the loglikelihood, we have to specify a learning rate $\lambda\in\mathbb{R}_{+}$, usualy something like $10^{3}$. The update rule is





\begin{displaymath}





\mat{\theta}^{(I + 1)} = \mat{\theta}^{(I)} + \lambda\nabla_{\mat{\theta}} l_n(\mat{\theta})\bigr_{\mat{\theta} = \mat{\theta}^{(I)}},





\end{displaymath}





which is iterated till convergence. In practice, iteration is performed until ether a maximum number of iterations is exhausted and/or some break condition is satisfied. A proper choise of the learning rate is needed as a too large learning rate $\lambda$ causes instabilities, while a too low learning rate requires an enourmes ammount of iterations. Generically, there are two approach against the need to determine a proper lerning rate. First, \emph{line search methods} determin an appropriate step size for every iteration. This works great if the evaluation of the object function (the loglikelihood) is cheap. This is not the case in our setting, see \cref{sec:isingbiggerdim}. The second approach is an \emph{addaptive learning rate}. The basic idea is to track specific statistics while optimizing and dynamiclly addapt the leaning rate via well tested heuristics using the gathered knowledge from past iterations. We opted to use an addaptive leaning rate approach, this not only levaites the need to determin an approriate leaning rate, but also excelerates learning.










Our method of choise is RMSprop, which stands for \emph{root mean squared propagation} \textcite{rmspropHinton2012}. This is a well known method in maschine learning for training neural networks. Its a variation of gradient descent with an per scalar parameter addaptive learning rate. It tracks a moving average of the element wise squared gradient $\mat{g}_2^{(I)}$, which is then used to scale (element wise) the gradient in the update rule. See \textcite{rmspropHinton2012,deeplearningbookGoodfellowEtAl2016} among others. The update rule using RMSprop for maximization\footnote{Instead of the more common minimization, therefore $+$ in the update of $\mat{\theta}$.} is





\begin{align*}





\mat{g}_2^{(I + 1)} &= \nu \mat{g}_2^{(I)} + (1  \nu)\nabla l_n(\mat{\theta}^{(I)})\odot\nabla l_n(\mat{\theta}^{(I)}), \\





\mat{\theta}^{(I + 1)} &= \mat{\theta}^{(I)} + \frac{\lambda}{\sqrt{\mat{g}_2^{(I + 1)}} + \epsilon}\odot\nabla l_n(\mat{\theta}^{(I)}).





\end{align*}





The parameters $\nu = 0.9$, $\lambda = 10^{3}$ and $\epsilon\approx 1.49\cdot 10^{8}$ are fixed. The initial value of $\mat{g}_2^{(0)} = \mat{0}$, the symbol $\odot$ denotes the Hadamard product, that is the element wise multiplication. The divition and sqaure root operation are performed element wise as well. According to our experiments, RMSprop requires in the range of $50$ till $1000$ iterations till convergence while gradient ascent with a learning rate of $10^{3}$ is in the range of $1000$ till $10000$. \todo{check this!}










%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%





\subsubsection{Small Data Sets}\label{sec:isingsmalldatasets}





In case of a finite number of observations, specifically in data sets with a small number of observations $n$, the situation where one components is always ether zero or one can occure. Its also possible to observe two exclusive components. This situation of a ``degenerate'' data set needs to be saveguarded against in practive. Working with parameters on a logscale, this gives estimates of $\pm\infty$. This is outside of the parameter space and breaks our optimization algorithm.










The first situation where this needs to be addressed is in \eqref{eq:isinginitOmegas}, where we set initial estimates for $\mat{\Omega}_k$. To avoid divition by zero as well as evaluating the log of zero, we addapt \eqref{eq:isingmodemoments}, the mode wise moment estimates $\hat{\mat{M}}_{2(k)}$. A simple method is to replace the ``degenerate'' components, that are entries with value $0$ or $1$, with the smallest positive estimate of exactly one occurence $p_k / n p$, or all but one occurence $1  p_k / n p$, respectively.










The same problem arives in gradient optimization. Therefore, before starting the optimization, we detect degenerate combinations. We compute upper and lower bounds for the ``degenerate'' element in the Kronecker product $\hat{\mat{\Omega}} = \bigkron_{k = r}^{1}\hat{\mat{\Omega}}_k$. After every gradient update, we check if any of the ``degenerate'' elements fall outside of the bounds. In that case, we adjust all the elements of the Kronecker component estimates $\hat{\mat{\Omega}}_k$, corresponding to the ``degenerate'' element of their Kronecker product, to fall inside the precomputed bounds. While doing so, we try to alter every component as little as possible to ensure that the nondegenerate elements in $\hat{\mat{\Omega}}$, effected by this change due to its Kronecker structure, are altered as little as possible. The exact details are technically cumbersome while providing little insight. \todo{For more details we refer the reader to the source code prodived with the supplemental material.}




















%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%





\subsubsection{Slightly Bigger Dimensions}\label{sec:isingbiggerdim}





A big challange with the Ising model is its high computational complexity. The reason is in the sum over all binary vectors of length $p = \prod_{k = 1}^{r}p_k$ in the partition function \eqref{eq:isingpartitionfunction}. Computing the partition function exactly requires to sum all $2^p$ binary vectors. Small dimensions, like $p\approx 10$, this is easily computed. Increasing the dimension bejond $20$ gets extremely expensive while for dimensions bigger than $30$ its absolutely impossible. Trying to avoid the evaluation of the loglikelihood and only computing its partial gradients via \cref{thm:grad} does not resolve the issue. The gradients require the inverse link, in other words the second moment \eqref{eq:isingm2}, where, if dropping the scaling factor $p_0$, still involves to sum $2^p$ summands. Basically, with our model, this means that the optimization of the Ising model using exactly computed gradients is impossible for moderately sized problems.










For estimation of dimensions $p$ bigger than $20$, we use a MonteCarlo method to estimate the second moment \eqref{eq:isingm2}, required to compute the partial gradients of the loglikelihood. Specifically, we use a GibbsSampler to sample from the conditional distribution and approximate the second moment in an importance sampling framework. This can be implemented quite efficiently while the estimation accuracy for the second moment is evaluated experimentaly which seems to be very reliable. simultaniously, we use the same approach to estimate the partition funciton. This though, is in comparison inaccurate, and may only be used to get a rough idea of the loglikelihood. Regardles, for our method, we only need the gradient for optimization where appropriate break conditions, not based on the likelihood, lead to a working method for MLE estimation.










\begin{figure}





\centering





\includegraphics[]{plots/simisingperftm2.pdf}





\caption{\label{fig:isingm2perft}Performance test for computing/estimating the second moment of the Ising model of dimension $p$ using ether the exact method or a MonteCarlo simulation.}





\end{figure}















\newpage





%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%





\section{Statistical Properties}










\subsection{Kronecker Product Manifolds}





\subsection{Kronecker Product Manifolds}\label{sec:kronmanifolds}





%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%










\cref{thm:sdr} identifies the sufficient reduction for the regression of $Y$ on $\ten{X}$ in the population. Any estimation of the sufficient reduction requires application of some optimality criterion. As we operate within the framework of the exponential family, we opted for maximum likelihood estimation (MLE). For the unconstrained problem, where the parameters are simply $\mat{B}$ and $\mat{\Omega}$ in \eqref{eq:eta1manifold}, maximizing the likelihood of $\ten{X} \mid Y$ is straightforward and yields welldefined MLEs of both parameters. Our setting, though, requires the constrained optimization of the $\ten{X} \mid Y$ likelihood subject to $\mat{B} = \bigotimes_{j = r}^{1}\mat{\beta}_j$ and $\mat{\Omega}=\bigkron_{j = r}^{1}\mat{\Omega}_j$. \Cref{thm:kronmanifolds,thm:parammanifold} provide the setting for which the MLE of the constrained parameter $\mat{\theta}$ is welldefined, which in turn leads to the derivation of its asymptotic normality.





\cref{thm:sdr} identifies the sufficient reduction for the regression of $Y$ on $\ten{X}$ in the population. Any estimation of the sufficient reduction requires application of some optimality criterion. As we operate within the framework of the exponential family, we opted for maximum likelihood estimation (MLE). For the unconstrained problem, where the parameters are simply $\mat{B}$ and $\mat{\Omega}$ in \eqref{eq:eta1manifold}, maximizing the likelihood of $\ten{X} \mid Y$ is straightforward and yields welldefined MLEs of both parameters. Our setting, though, requires the constrained optimization of the $\ten{X} \mid Y$ likelihood subject to $\mat{B} = \bigotimes_{j = r}^{1}\mat{\beta}_j$ and $\mat{\Omega}=\bigkron_{j = r}^{1}\mat{\Omega}_j$. \Cref{thm:kronmanifolds,thm:parammanifold} provide the setting for which the MLE of the constrained parameter $\mat{\theta}$ is welldefined, which in turn leads to the derivation of its asymptotic normality.










The main problem to be solves for asymtotic results of the MLE of the constraint parameter $\mat{\theta} = (\overline{\ten{\eta}}, \vec\mat{B}, \vech\mat{\Omega})$ derives from the nature of the constraint. We assumed that $\mat{B} = \bigkron_{k = r}^{1}\mat{\beta}_k$, where the parameter $\mat{B}$ is identifiable. This means that different values of $\mat{B}$ lead to different densities $f_{\mat{\theta}}(\ten{X}\mid Y = y)$, a basic property needed to ensure consistency of parameter estimates, which in turn is needed for asymptotic normality. On the other hand, the components $\mat{\beta}_j$ for $j = 1, \ldots, r$ are \emph{not} identifyable. A direct consequence of the identity $\mat{\beta}_2\otimes\mat{\beta}_1 = (c\mat{\beta}_2)\otimes (c^{1}\mat{\beta}_1)$ for every $c\neq 0$. This is the reason we formulated $\Theta$ as a constraint parameter space instead of parameterizing the densities of $\ten{X}\mid Y$ with respect to the components $\mat{\beta}_1, \ldots, \mat{\beta}_r$. The situation for $\mat{\Omega} = \bigkron_{k = r}^{1}\mat{\Omega}_k$ is the same.









@ 716,18 +799,49 @@ As a basis to ensure that the constraint parameter space $\Theta$ is a manifold,





\end{theorem}










\subsection{Matrix Manifolds}\label{sec:matrixmanifolds}





A powerful side effect of \cref{thm:parammanifold} is the modeling flexibinity it provides. For example, we can perform low rank regression. Or, we may constrain twoway interactions between direct axis neighbors by using band matrices for the $\mat{\Omega}_k$'s, among others.










{\color{red}





Flexibility of Manifold Structure





This flexibility derives from many different matrix manifolds that can be used as building blocks $\manifold{B}_k$ and $\manifold{O}_k$ of the parameter space $\Theta$ in \cref{thm:parammanifold}. A list of possible choices, among others, is given in \cref{tab:matrixmanifolds}. As long as parameters in $\Theta$ are valid paramererization of a density (or PMF) of \eqref{eq:quadraticexpfam} subject to \eqref{eq:eta1manifold} and \eqref{eq:eta2manifold}, one may choose any of the manifolds listed in \cref{tab:matrixmanifolds} which are ether cones or spherical. We also included an example which is nether a sphere nor a cone. They may also be valid building blocks, but require more work as they are not directly leading to a parameter manifold by \cref{thm:parammanifold}. In case one can show the resulting parameter space $\Theta$ is an embedded manifold, the asymtotic theory of \cref{sec:asymtotics} is applicable.










\begin{enumerate}





\item $\mat{\Omega}$ is a Kronecker product: Kronecker products of pos. def. matrices are elements of a manifold





\item $\mat{B}$ is a Kronecker product: any combination of $\mat{\beta}_j$ full rank, rank $r$, or semiorthogonal matrices would be an element of a manifold





\end{enumerate}





}





\begin{table}





\centering





\begin{tabular}{l  l  c c c}





Symbol & Description & C & S & Dimension\\ \hline





$\mathbb{R}^{p\times q}$ & All matrices of dimension $p\times q$ &





\checkmark & \xmark & $p q$ \\ \hline





& Full rank $p\times q$ matrices &





\checkmark & \xmark & $p q$ \\ \hline





$\mathrm{St}(p, q)$ & \emph{Stiefel Manifold}, $\{ \mat{U}\in\mathbb{R}^{p\times q} : \t{\mat{U}}\mat{U} = \mat{I}_q \}$ for $q\leq p$ &





\xmark & \checkmark & $p q  q (q + 1) / 2$ \\ \hline





$\mathcal{S}^{p  1}$ & Unit sphere in $\mathbb{R}^p$, special case $\Stiefel{p}{1}$ &





\xmark & \checkmark & $p  1$ \\ \hline





$U(p)$ & Unitary Group, special case $\Stiefel{p}{p}$ &





\xmark & \checkmark & $p (p  1) / 2$ \\ \hline





$SU(p)$ & Special Unitary Group $\{ \mat{U}\in U(p) : \det{\mat{U}} = 1 \}$ &





\xmark & \checkmark & $p (p  1) / 2$ \\ \hline





& Matrices of known rank $r > 0$, generalizes $\StiefelNonCompact{p}{q}$ &





\checkmark & \xmark & $r(p + q  r)$ \\ \hline





& Symmetric matrice &





\checkmark & \xmark & $\frac{1}{2}p(p+1)$ \\ \hline





$SPD(p)$ & Symmetric Positive Definite matrices &





\checkmark & \xmark & $\frac{1}{2}p(p+1)$ \\ \hline





& Scaled Identity $\{ a\mat{I}_p : a\in\mathbb{R}_{+} \}$ &





\checkmark & \xmark & $1$ \\ \hline





& Symmetric $r$band matrices (includes diagonal) &





\checkmark & \xmark & $(2 p  r) (r + 1) / 2$ \\





& $\qquad\{ \mat{A}\in\mathbb{R}^{p\times p} : \mat{A} = \t{\mat{A}}\land \mat{A}_{i j} = 0\ \forall i  j > r \}$ \\ \hline





& Auto correlation alike $\{ \mat{A}\in\mathbb{R}^{p\times p} : \mat{A}_{i j} = \rho^{i  j}, \rho\in(0, 1) \}$ &





\xmark & \xmark & $1$ \\ \hline





\end{tabular}





\caption{\label{tab:matrixmanifolds}Examples of embedded matrix manifolds. ``Symbol'' a (more or less) common notation for the matrix manifold, if at all. ``C'' stands for \emph{cone}, meaning it is scale invariant. ``S'' means \emph{spherical}, that is, constant Frobenius norm.}





\end{table}










\begin{remark}





The \emph{Grassmann Manifold} of $q$ dimensional subspaces in $\mathbb{R}^p$ is not listed in \cref{tab:matrixmanifolds} since it is not embedded in $\mathbb{R}^{p \times q}$.





\end{remark}















\subsection{Asymptotics}





\subsection{Asymptotics}\label{sec:asymtotics}










Let $Z$ be a random variable distributed according to a parameterized probability distribution with density $f_{\mat{\theta_0}}\in\{ f_{\mat{\theta}} : \mat{\theta}\in\Theta \}$ where $\Theta$ is a subset of Euclidean space. We want to estimate the parameter ${\mat{\theta}}_0$ using $n$ i.i.d. (independent and identically distributed) copies of $Z$. We assume a known, realvalued and measurable function $z\mapsto m_{\mat{\theta}}(z)$ for every $\mat{\theta}\in\Theta$ while ${\mat{\theta}}_0$ maximizes the map $\mat{\theta}\mapsto M(\mat{\theta}) = \E m_{\mat{\theta}}(Z)$ uniquely. For the estimation we maximize the empirical version





\begin{align}\label{eq:Mn}




@ 828,6 +942,12 @@ We compair our method with a few other methods for the tensor normal and the Isi





\end{figure}















\begin{figure}





\centering





\includegraphics[width = \textwidth]{plots/simtsir.pdf}





\caption{\label{fig:simtsir}Simulation to investiage the unexpected failure of TSIR in simulation 1c.}





\end{figure}















%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%





\subsection{Ising Model}




@ 1020,7 +1140,7 @@ as well as for any tensor $\ten{A}$ of even order $2 r$ and matching square matr















%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%





\section{Proofs}





\section{Proofs}\label{app:B}





%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%





\begin{proof}[Proof of \cref{thm:sdr}]\label{proof:sdr}





A direct implication of \textcite[Theorem~1]{sdrBuraDuarteForzani2016} is that, under the exponential family \eqref{eq:quadraticexpfam} with natural statistic \eqref{eq:tstat},




