fix: stability of gmlm_ising and added slicing;
add: higher-order approx.kron; add: paper.tex - simulation plots and tensor normal MLE estimation

parent 6477d78190
commit 399f878fbb

LaTeX/paper.tex | 276
@@ -487,15 +487,15 @@ Equation \eqref{eq:eta2} is underdetermined since $\t{(\mat{T}_2\pinv{\mat{D}_p}
\todo{Maybe already here introduce the ``constraint'' set of $\Omega$'s allowed as $\{ \Omega\in\mathbb{R}_{++}^{p\times p} : \vec{\Omega} = \t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})}\mat{T}_2\pinv{\mat{D}_p})}\vec{\Omega} \}$}

In a classical \emph{generalized linear model} (GLM), the link function connecting the natural parameters to the expectation of the sufficient statistic $\mat{\eta}_y = \mat{g}(\E[\mat{t}(\ten{X}) \mid Y = y])$ is invertible. Such a link may not exist in our setting, but for our purposes what we call the ``inverse'' link suffices. The ``inverse'' link $\widetilde{\mat{g}}$ exists because the natural parameters fully describe the distribution. As in the non-minimal formulation \eqref{eq:quadratic-exp-fam}, we define the ``inverse'' link through its tensor-valued components
-\begin{align*}
+\begin{align}
-	\ten{g}_1(\mat{\eta}_y) &= \E[\ten{X} \mid Y = y], \\
+	\ten{g}_1(\mat{\eta}_y) &= \E[\ten{X} \mid Y = y], \label{eq:inv-link1}\\
-	\ten{g}_2(\mat{\eta}_y) &= \E[\ten{X}\circ\ten{X} \mid Y = y]
+	\ten{g}_2(\mat{\eta}_y) &= \E[\ten{X}\circ\ten{X} \mid Y = y] \label{eq:inv-link2}
-\end{align*}
+\end{align}
as $\widetilde{\mat{g}}(\mat{\eta}_y) = (\vec\ten{g}_1(\mat{\eta}_y), \vec\ten{g}_2(\mat{\eta}_y))$.

Under the quadratic exponential family model \eqref{eq:quadratic-exp-fam}, a sufficient reduction for the regression of $Y$ on $\ten{X}$ is given in \cref{thm:sdr}.
\begin{theorem}[SDR]\label{thm:sdr}
-	A sufficient reduction for the regression $Y\mid \ten{X}$ under the quadratic exponential family inverse regression model \eqref{eq:quadratic-exp-fam} with natural parameters \eqref{eq:eta1} and \eqref{eq:eta2} is given by
+	A sufficient reduction for the regression $Y\mid \ten{X}$ under the quadratic exponential family inverse regression model \eqref{eq:quadratic-exp-fam} with natural parameters modeled by \eqref{eq:eta1} and \eqref{eq:eta2} is given by
	\begin{align}\label{eq:sdr}
		\ten{R}(\ten{X})
		% &= (\ten{X} - \E\ten{X})\times\{ \t{\mat{\beta}_1}, \ldots, \t{\mat{\beta}_r} \}.
@@ -551,7 +551,7 @@ The maximum likelihood estimate of $\mat{\theta}_0$ is the solution to the optim
\end{equation}
with $\hat{\mat{\theta}}_n = (\vec\widehat{\overline{\ten{\eta}}}, \vec\widehat{\mat{B}}, \vech\widetilde{\mat{\Omega}})$ where $\widehat{\mat{B}} = \bigkron_{k = r}^{1}\widehat{\mat{\beta}}_k$ and $\widehat{\mat{\Omega}} = \bigkron_{k = r}^{1}\widehat{\mat{\Omega}}_k$.

-A straightforward and general method for parameter estimation is \emph{gradient descent}. For pure algorithmic speedup, changing only the update rule but \emph{not} the gradient, denoted $\nabla_{\mat{\theta}}l_n$, of the objective function, we use an adapted version of \emph{Nesterov Accelerated Gradient} descent (NAG), originally described in \textcite{Nesterov1983}, with an internal line search to determine the step size in each iteration, as described in \cref{alg:NAGD}. To apply gradient-based optimization, we compute the gradients of $l_n$ in \cref{thm:grad}.
+A straightforward and general method for parameter estimation is \emph{gradient descent}. To apply gradient-based optimization, we compute the gradients of $l_n$ in \cref{thm:grad}.

\begin{theorem}\label{thm:grad}
For $n$ i.i.d. observations $(\ten{X}_i, y_i), i = 1, ..., n$, the log-likelihood has the form \eqref{eq:log-likelihood} with $\mat{\theta}$ being the collection of all GMLM parameters $\overline{\ten{\eta}}$, ${\mat{B}} = \bigkron_{k = r}^{1}{\mat{\beta}}_k$ and ${\mat{\Omega}} = \bigkron_{k = r}^{1}{\mat{\Omega}}_k$ for $k = 1, ..., r$. Furthermore, let $\ten{G}_2(\mat{\eta}_y)$ be a tensor of dimensions $p_1, \ldots, p_r$ such that
@@ -569,43 +569,84 @@ A straightforward and general method for parameter estimation is \emph{gradient
If $\mat{T}_2$ is the identity matrix $\mat{I}_{p(p + 1) / 2}$, then $\ten{G}_2(\mat{\eta}_y) = \ten{g}_2(\mat{\eta}_y)$.
\end{theorem}

-\begin{algorithm}[hpt!]
-\caption{\label{alg:NAGD}Nesterov Accelerated Gradient Descent}
-\begin{algorithmic}[1]
-	\State Objective: $l_n(\mat{\theta})$
-	\State Arguments: Order $r + 1$ tensors $\ten{X}$, $\ten{F}$ \todo{ask Effie about input description}
-	\State Initialize: Parameters $\mat{\theta}^{(0)}$, $0 < \delta^{(1)}$ and $0 < \gamma < 1$
-	\State $t \leftarrow 1$ \Comment{step counter}
-	\State $\mat{\theta}^{(1)} \leftarrow \mat{\theta}^{(0)}$ \Comment{artificial first step}
-	\State $(m^{(0)}, m^{(1)}) \leftarrow (0, 1)$ \Comment{momentum extrapolation weights}
-	\Repeat \Comment{repeat until convergence}
-		\State $\mat{M} \leftarrow \mat{\theta}^{(t)} + \frac{m^{(t - 1)} - 1}{m^{(t)}}(\mat{\theta}^{(t)} - \mat{\theta}^{(t - 1)})$ \Comment{momentum extrapolation}
-		\For{$\delta = \gamma^{-1}\delta^{(t)}, \delta^{(t)}, \gamma\delta^{(t)}, \gamma^2\delta^{(t)}, \ldots$} \Comment{line search}
-			\State $\mat{\theta} \leftarrow \mat{M} + \delta \nabla l_n(\mat{M})$
-			\If{$l_n(\mat{\theta}) \leq l_n(\mat{\theta}^{(t - 1)}) - \delta \|\nabla_{\mat{\theta}} l_n(\mat{M})\|_F^2$} \Comment{Armijo condition}
-				\State $\mat{\theta}^{(t + 1)} \leftarrow \mat{\theta}$
-				\State $\delta^{(t + 1)} \leftarrow \delta$
-				\Break
-			\EndIf
-		\EndFor
-		\State $m^{(t + 1)} \leftarrow \frac{1 + \sqrt{1 + (2 m^{(t)})^2}}{2}$ \Comment{update extrapolation weights}
-		\State $t \leftarrow t + 1$
-	\Until converged
-\end{algorithmic}
-\end{algorithm}
-
-\todo{Update the following, especially check that $\ten{X}$ needs to be centered (as in the current version implicitly assumed)}
-\cref{alg:NAGD} requires initial parameter estimates $\mat{\theta}^{(0)}$. Therefore, we choose a simple heuristic approach by starting with $\mat{\Omega}_k^{(0)} = \mat{I}_{p_k}$ and $\overline{\ten{\eta}}_1^{(0)}$ dependent on the particular distribution. For example, for a tensor normal distribution the sample mean is a good choice, while for the Ising model zero initialization has nice properties. The interesting parts are the multi-linear predictors $\mat{\beta}_k$. Therefore, let $\mat{\Sigma}_k = \frac{q_k}{n q}\sum_{i = 1}^n {\ten{F}_{y_i}}_{(k)}\t{{\ten{F}_{y_i}}_{(k)}}$ and $\mat{\Delta}_k = \frac{p_k}{n p}\sum_{i = 1}^n {\ten{X}_{i}}_{(k)}\t{{\ten{X}_{i}}_{(k)}}$ be mode-wise sample covariance estimates. For both we take the order $s_k = \min(p_k, q_k)$ SVD approximation $\mat{\Delta}_k \approx \mat{U}_k\mat{D}_k\t{\mat{U}_k}$ and $\mat{\Sigma}_k \approx \mat{V}_k\mat{S}_k\t{\mat{V}_k}$. The approximate SVD gives us the first $s_k$ singular vectors with dimensions $\mat{U}_k(p_k\times s_k), \mat{D}_k(s_k\times s_k)$ and $\mat{V}_k(q_k\times s_k), \mat{S}_k(s_k\times s_k)$. Then we initialize
-\begin{displaymath}
-	\mat{\beta}_k^{(0)} = \mat{U}_k\sqrt{\mat{D}_k}\sqrt{\mat{S}_k}\t{\mat{V}_k}.
-\end{displaymath}
-The default hyper-parameters in \cref{alg:NAGD} are set to $\delta^{(1)} = 1 / 100$ for the initial step size, and the line search step parameter $\gamma = 2 / (1 + \sqrt{5})$ is the inverse golden ratio.
+Although any GMLM model can in principle be fitted via gradient descent using \cref{thm:grad}, this may be very inefficient. In the case of the tensor normal distribution, an iterative cyclic updating scheme can be derived by setting the partial gradients of \cref{thm:grad} to zero. Described in more detail in \cref{sec:tensor_normal_estimation}, this method converges much faster, is stable, and does not require any hyperparameters. On the other hand, the Ising model does not allow such a scheme; there we need to use a gradient-based method, which is the subject of \cref{sec:ising_estimation}.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Tensor Normal}\label{sec:tensor_normal_estimation}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+With $\ten{X}\mid Y = y$ following a tensor normal distribution, its density, with mean $\ten{\mu}_y$ and covariance $\mat{\Sigma} = \bigkron_{k = r}^{1}\mat{\Sigma}_k$, is given by
+\begin{displaymath}
+	f_{\mat{\theta}}(\ten{X}\mid Y = y) = (2\pi)^{-p / 2}\prod_{k = 1}^{r}\det(\mat{\Sigma}_k)^{-p / 2 p_k}\exp\left( -\frac{1}{2}\left\langle\ten{X} - \ten{\mu}_y, (\ten{X} - \ten{\mu}_y)\mlm_{k = 1}^{r}\mat{\Sigma}_k^{-1} \right\rangle \right).
+\end{displaymath}
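For completeness, a short verification of how the natural parameters can be read off (this uses only bilinearity of the inner product and symmetry of the $\mat{\Sigma}_k$'s; the $\ten{X}$-free term is absorbed into the log-partition function):
\begin{displaymath}
	-\frac{1}{2}\Bigl\langle\ten{X} - \ten{\mu}_y, (\ten{X} - \ten{\mu}_y)\mlm_{k = 1}^{r}\mat{\Sigma}_k^{-1}\Bigr\rangle
	= \Bigl\langle\ten{X}, \ten{\mu}_y\mlm_{k = 1}^{r}\mat{\Sigma}_k^{-1}\Bigr\rangle
	- \frac{1}{2}\Bigl\langle\ten{X}, \ten{X}\mlm_{k = 1}^{r}\mat{\Sigma}_k^{-1}\Bigr\rangle
	- \frac{1}{2}\Bigl\langle\ten{\mu}_y, \ten{\mu}_y\mlm_{k = 1}^{r}\mat{\Sigma}_k^{-1}\Bigr\rangle.
\end{displaymath}
The term linear in $\ten{X}$ identifies the first-moment natural parameter, and the quadratic term identifies the second-moment one, which is where the scaling constant $c = -1/2$ below comes from.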
+We assume the distribution is non-degenerate, which means that the covariances $\mat{\Sigma}_k$ are symmetric positive definite matrices. For the sake of simplicity, we assume w.l.o.g. that $\ten{X}$ has marginal expectation equal to zero, $\E\ten{X} = 0$. Rewriting this in the quadratic exponential family form \eqref{eq:quadratic-exp-fam} determines the scaling constant $c = -1/2$ and gives the relation to the GMLM parameters $\overline{\ten{\eta}}, \mat{\beta}_k$ and $\mat{\Omega}_k$, for $k = 1, \ldots, r$, as
+\begin{displaymath}
+	\ten{\mu}_y = \ten{F}_y\mlm_{k = 1}^{r}\mat{\Omega}_k^{-1}\mat{\beta}_k, \qquad
+	\mat{\Omega}_k = \mat{\Sigma}_k^{-1},
+\end{displaymath}
+where we used that $\overline{\ten{\eta}} = 0$ due to $0 = \E\ten{X} = \E\E[\ten{X}\mid Y] = \E\ten{\mu}_Y$ in combination with $\E\ten{F}_Y = 0$. Additionally, all the $\mat{\Omega}_k$'s are symmetric positive definite, because the $\mat{\Sigma}_k$'s are. This leads to another simplification: the matrix $\mat{T}_2$ from \eqref{eq:t-stat} is equal to the identity. This also means that the gradients of the log-likelihood $l_n$ in \cref{thm:grad} are simpler as well. We get
+\begin{displaymath}
+	\ten{g}_1(\mat{\eta}_y) = \E[\ten{X}\mid Y = y] = \ten{\mu}_y, \qquad
+	\ten{G}_2(\mat{\eta}_y) = \ten{g}_2(\mat{\eta}_y) = \E[\ten{X}\circ\ten{X}\mid Y = y] \equiv \bigkron_{k = r}^1\mat{\Sigma}_k + (\vec{\ten{\mu}}_y)\t{(\vec{\ten{\mu}}_y)}.
+\end{displaymath}
+
+In practice, we start the estimation process by demeaning a given data set $(\ten{X}_i, \ten{F}_{y_i})$ with $i = 1, \ldots, n$ observations. After demeaning, only the reduction matrices $\mat{\beta}_k$ and the scatter matrices $\mat{\Omega}_k$ need to be estimated. Then, to solve the optimization problem \eqref{eq:mle}, with $\overline{\ten{\eta}} = 0$ after demeaning, we need some initial values. For the initial estimates $\hat{\mat{\beta}}_k^{(0)}$ we use a simple heuristic approach. First, we compute moment-based mode-wise marginal covariance estimates $\widehat{\mat{\Sigma}}_k(\ten{X})$ and $\widehat{\mat{\Sigma}}_k(\ten{F}_Y)$ given by
+\begin{displaymath}
+	\widehat{\mat{\Sigma}}_k(\ten{X}) = \frac{1}{n}\sum_{i = 1}^{n} (\ten{X}_i)_{(k)}\t{(\ten{X}_i)_{(k)}}, \qquad
+	\widehat{\mat{\Sigma}}_k(\ten{F}_Y) = \frac{1}{n}\sum_{i = 1}^{n} (\ten{F}_{y_i})_{(k)}\t{(\ten{F}_{y_i})_{(k)}}.
+\end{displaymath}
+Then, for every mode $k = 1, \ldots, r$, we compute the first $j = 1, \ldots, q_k$ eigenvectors $\mat{v}_j(\widehat{\mat{\Sigma}}_k(\ten{X}))$, $\mat{v}_j(\widehat{\mat{\Sigma}}_k(\ten{F}_Y))$ and eigenvalues $\lambda_j(\widehat{\mat{\Sigma}}_k(\ten{X}))$, $\lambda_j(\widehat{\mat{\Sigma}}_k(\ten{F}_Y))$ of the marginal covariance estimates. Let
+\begin{align*}
+	\mat{U}_k &= (\mat{v}_1(\widehat{\mat{\Sigma}}_k(\ten{X})), \ldots, \mat{v}_{q_k}(\widehat{\mat{\Sigma}}_k(\ten{X}))), \\
+	\mat{D}_k &= \diag(\lambda_1(\widehat{\mat{\Sigma}}_k(\ten{X}))\lambda_1(\widehat{\mat{\Sigma}}_k(\ten{F}_{Y})), \ldots, \lambda_{q_k}(\widehat{\mat{\Sigma}}_k(\ten{X}))\lambda_{q_k}(\widehat{\mat{\Sigma}}_k(\ten{F}_{Y}))), \\
+	\mat{V}_k &= (\mat{v}_1(\widehat{\mat{\Sigma}}_k(\ten{F}_Y)), \ldots, \mat{v}_{q_k}(\widehat{\mat{\Sigma}}_k(\ten{F}_Y))).
+\end{align*}
+The initial estimates are then set to
+\begin{displaymath}
+	\hat{\mat{\beta}}_k^{(0)} = \mat{U}_k\sqrt{\mat{D}_k}\t{\mat{V}_k}.
+\end{displaymath}
+The initial values of $\mat{\Omega}_k$ are set to the identity, $\mat{\Omega}_k^{(0)} = \mat{I}_{p_k}$.
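A minimal base-R sketch of this initialization (illustration only: the function and helper names are hypothetical, not package API; the exported `gmlm_tensor_normal` performs the actual fitting). It assumes arrays `X` ($p_1\times\dots\times p_r\times n$) and `Fy` ($q_1\times\dots\times q_r\times n$) with the sample axis last and $q_k \leq p_k$:

    mode_cov <- function(A, k) {            # mode-k marginal covariance estimate
        n <- tail(dim(A), 1L)
        Ak <- matrix(aperm(A, c(k, seq_along(dim(A))[-k])), nrow = dim(A)[k])
        tcrossprod(Ak) / n
    }
    init_beta <- function(X, Fy, k) {
        eX <- eigen(mode_cov(X, k), symmetric = TRUE)
        eF <- eigen(mode_cov(Fy, k), symmetric = TRUE)
        qk <- dim(Fy)[k]
        U <- eX$vectors[, seq_len(qk), drop = FALSE]            # U_k
        V <- eF$vectors[, seq_len(qk), drop = FALSE]            # V_k
        d <- eX$values[seq_len(qk)] * eF$values[seq_len(qk)]    # diag(D_k)
        U %*% (sqrt(d) * t(V))                                  # U_k sqrt(D_k) V_k'
    }
    betas0  <- lapply(seq_len(length(dim(X)) - 1L), function(k) init_beta(X, Fy, k))
    Omegas0 <- lapply(dim(X)[-length(dim(X))], diag)            # identity matrices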
+
+Given any estimates $\hat{\mat{\beta}}_1, \ldots, \hat{\mat{\beta}}_r, \hat{\mat{\Omega}}_1, \ldots, \hat{\mat{\Omega}}_r$, we take the gradient $\nabla_{\mat{\beta}_j}$ of the log-likelihood $l_n$ in \eqref{eq:log-likelihood} of the tensor normal distribution from \cref{thm:grad} and keep all other parameters except $\mat{\beta}_j$ fixed. Then, $\nabla_{\mat{\beta}_j}l_n = 0$ has the closed-form solution
+\begin{equation}\label{eq:tensor_normal_beta_solution}
+	\t{\mat{\beta}_j} = \biggl(
+		\sum_{i = 1}^{n}
+		\Bigl( \ten{F}_{y_i}\mlm_{k \neq j}\hat{\mat{\Omega}}_k^{-1}\hat{\mat{\beta}}_k \Bigr)_{(j)}
+		\t{\Bigl( \ten{F}_{y_i}\mlm_{k \neq j}\hat{\mat{\beta}}_k \Bigr)_{(j)}}
+	\biggr)^{-1}
+	\biggl(
+		\sum_{i = 1}^{n}
+		\Bigl( \ten{F}_{y_i}\mlm_{k \neq j}\hat{\mat{\beta}}_k \Bigr)_{(j)}
+		\t{(\ten{X}_{i})_{(j)}}
+	\biggr)
+	\hat{\mat{\Omega}}_j.
+\end{equation}
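For illustration, a base-R sketch of this closed-form update (hypothetical helpers, not package API; `ttm` is a plain mode-$k$ tensor-times-matrix product, and `X`, `Fy` carry the sample axis last):

    ttm <- function(A, M, k) {                  # A x_k M, folded back into an array
        d <- dim(A)
        perm <- c(k, seq_along(d)[-k])
        Ak <- M %*% matrix(aperm(A, perm), nrow = d[k])
        dp <- d[perm]; dp[1] <- nrow(M)
        aperm(array(Ak, dp), order(perm))
    }
    update_beta <- function(j, X, Fy, betas, Omegas) {
        unfold <- function(A) matrix(aperm(A, c(j, seq_along(dim(A))[-j])), nrow = dim(A)[j])
        F1 <- Fy; F2 <- Fy    # F x_{k != j} Omega_k^{-1} beta_k  and  F x_{k != j} beta_k
        for (k in setdiff(seq_along(betas), j)) {
            F1 <- ttm(F1, solve(Omegas[[k]], betas[[k]]), k)
            F2 <- ttm(F2, betas[[k]], k)
        }
        lhs <- tcrossprod(unfold(F1), unfold(F2))   # sum_i (.)_{(j)} (.)_{(j)}'
        rhs <- tcrossprod(unfold(F2), unfold(X))    # sum_i (.)_{(j)} (X_i)_{(j)}'
        t(solve(lhs, rhs %*% Omegas[[j]]))          # beta_j from the display above
    }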
+
+For the scatter matrices $\mat{\Omega}_j$, we need to fudge a bit. Equating the partial gradient of the $j$'th scatter matrix $\mat{\Omega}_j$ in \cref{thm:grad} to zero, $\nabla_{\mat{\Omega}_j}l_n = 0$, gives a quadratic matrix equation, due to the dependence of $\ten{\mu}_y$ on $\mat{\Omega}_j$. In practice though, it is more stable, faster, and equally accurate to use mode-wise covariance estimates via the residuals
+\begin{displaymath}
+	\hat{\ten{R}}_i = \ten{X}_i - \hat{\ten{\mu}}_{y_i} = \ten{X}_i - \ten{F}_{y_i}\mlm_{k = 1}^{r}\hat{\mat{\Omega}}_k^{-1}\hat{\mat{\beta}}_k.
+\end{displaymath}
+The estimates are computed via the intermediate values
+\begin{displaymath}
+	\tilde{\mat{\Sigma}}_j = \sum_{i = 1}^n (\hat{\ten{R}}_i)_{(j)} \t{(\hat{\ten{R}}_i)_{(j)}},
+\end{displaymath}
+which only lack the scaling $\tilde{s}$ such that $\tilde{s}\tilde{\mat{\Sigma}}_j = \hat{\mat{\Omega}}_j^{-1}$. The scaling condition is that the mean squared error has to be equal to the trace of the covariance estimate,
+\begin{displaymath}
+	\frac{1}{n}\sum_{i = 1}^n \langle \hat{\ten{R}}_i, \hat{\ten{R}}_i \rangle = \tr\bigkron_{k = r}^{1}\hat{\mat{\Omega}}_k^{-1} = \prod_{k = 1}^{r}\tr{\hat{\mat{\Omega}}_k^{-1}} = \tilde{s}^r\prod_{k = 1}^{r}\tr{\tilde{\mat{\Sigma}}_k}.
+\end{displaymath}
+Solving for the scaling value gives
+\begin{displaymath}
+	\tilde{s} = \biggl(\Bigl(\prod_{k = 1}^{r}\tr{\tilde{\mat{\Sigma}}_k}\Bigr)^{-1}\frac{1}{n}\sum_{i = 1}^n \langle \hat{\ten{R}}_i, \hat{\ten{R}}_i \rangle\biggr)^{1 / r},
+\end{displaymath}
+resulting in the estimates $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j)^{-1}$.
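Continuing the sketch (same hypothetical `ttm` helper as above), the residuals, intermediate values $\tilde{\mat{\Sigma}}_j$, and scaling $\tilde{s}$ translate directly:

    update_Omegas <- function(X, Fy, betas, Omegas) {
        n <- tail(dim(X), 1L); r <- length(betas)
        Mu <- Fy                                     # F x_k Omega_k^{-1} beta_k
        for (k in seq_len(r)) Mu <- ttm(Mu, solve(Omegas[[k]], betas[[k]]), k)
        R <- X - Mu                                  # residuals, all i at once
        Sig <- lapply(seq_len(r), function(j) {      # tilde(Sigma)_j
            Rj <- matrix(aperm(R, c(j, seq_along(dim(R))[-j])), nrow = dim(R)[j])
            tcrossprod(Rj)
        })
        traces <- vapply(Sig, function(S) sum(diag(S)), 0)
        s <- ((sum(R^2) / n) / prod(traces))^(1 / r) # mean squared error condition
        lapply(Sig, function(S) solve(s * S))        # Omega_j = (s Sigma_j)^{-1}
    }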
+
+Estimation is then performed by updating, for $j = 1, \ldots, r$, the estimates $\hat{\mat{\beta}}_j$ via \eqref{eq:tensor_normal_beta_solution}; then we recompute the $\hat{\mat{\Omega}}_j$ estimates simultaneously, keeping the $\hat{\mat{\beta}}_j$'s fixed. This procedure is repeated until convergence. Convergence is very fast; experiments showed that it usually occurs in fewer than $10$ iterations.
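Putting the sketches together, the cyclic scheme is a plain driver loop (tolerance and iteration cap illustrative; the exported `gmlm_tensor_normal` implements the actual method):

    for (iter in seq_len(50L)) {
        betas.old <- betas
        for (j in seq_along(betas)) betas[[j]] <- update_beta(j, X, Fy, betas, Omegas)
        Omegas <- update_Omegas(X, Fy, betas, Omegas)
        if (max(abs(unlist(betas) - unlist(betas.old))) < 1e-8) break
    }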
+
+A technical detail for numerical stability is to ensure that the scaled intermediate values $\tilde{s}\tilde{\mat{\Sigma}}_j$, assumed to be symmetric and positive definite, are well conditioned; otherwise, the procedure can become numerically unstable when attempting to compute $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j)^{-1}$. Therefore, we estimate the condition number of $\tilde{s}\tilde{\mat{\Sigma}}_j$ before computing the inverse. If it is ill conditioned, we regularize and use $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j + 0.2 \lambda_{1}(\tilde{s}\tilde{\mat{\Sigma}}_j)\mat{I}_{p_j})^{-1}$ instead, where $\lambda_{1}(\tilde{s}\tilde{\mat{\Sigma}}_j)$ is the first (maximum) eigenvalue. Experiments showed that this regularization is usually required in the first few iterations only.
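In code, this guard can be a simple condition-number check before the inversion (a sketch; the `rcond` threshold is illustrative):

    inv_scatter <- function(S) {                 # S = s * tilde(Sigma)_j
        if (rcond(S) < 1e-10) {                  # ill conditioned: regularize
            S <- S + 0.2 * max(eigen(S, symmetric = TRUE, only.values = TRUE)$values) * diag(nrow(S))
        }
        solve(S)
    }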
+
+Furthermore, we may assume that the parameter space follows the more general setting of \cref{thm:param-manifold}. In this case, updating may produce estimates outside of the parameter space. A simple and efficient remedy is to project every updated estimate onto the corresponding manifold.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Ising Model}\label{sec:ising_estimation}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\newpage

@@ -674,6 +715,8 @@ As a basis to ensure that the constraint parameter space $\Theta$ is a manifold,
then the constrained parameter space $\Theta = \mathbb{R}^p \times \manifold{K}_{\mat{B}}\times\manifold{CK}_{\mat{\Omega}}\subset\mathbb{R}^{p(p + 2 q + 3) / 2}$ is a smooth embedded manifold.
\end{theorem}

+\subsection{Matrix Manifolds}\label{sec:matrix-manifolds}
+
{\color{red}
Flexibility of Manifold Structure

@@ -716,55 +759,6 @@ It is not necessary to have a perfect maximizer, as long as the objective has fi
with asymptotic variance-covariance structure $\mat{\Sigma}_{\mat{\theta}_0}$ given in \eqref{eq:asymptotic-covariance-gmlm}.
\end{theorem}

-% \begin{theorem}[{\textcite[Thm~5.35]{asymStats-van_der_Vaart1998}}]
-% 	Let $\{ p_{\mat{\theta}} : \mat{\theta}\in\Theta \}$ be a collection of identifiable subprobability\footnote{A subprobability measure is a measure $\nu$ for the measure space $(\mathfrak{X}, \nu)$ such that $\nu(\mathfrak{X}) \leq 1$.} densities such that $P_{\mat{\theta}_0}$ is a probability measure. Then $M(\mat{\theta}) = P_{\mat{\theta}_0}\log p_{\mat{\theta}} / p_{\mat{\theta}_0}$ attains its maximum uniquely at $\mat{\theta}_0$.
-% \end{theorem}
-
-% \begin{proof}[Proof of \cref{thm:asymptotic-normality-gmlm}]
-% 	First we prove the consistency for which we apply \cref{thm:consistency-on-metric-spaces} and then prove that there
-
-% 	Most parts of the proof are identical to the proof of \textcite[Thm~3.1]{asymptoticMLE-BuraEtAl2018} with adaptations in a few places.
-
-% 	{\color{green}Bla Bla Bla ...}
-
-% 	\todo{regularity conditions!}
-
-% 	By rewriting \eqref{eq:eta1} as
-% 	\begin{displaymath}
-% 		\mat{\eta}_{1y} =
-% 		\vec{\overline{\ten{\eta}}} + \mat{B}\vec{\ten{F}_y} = (\vec{\overline{\ten{\eta}}}, \mat{B})\begin{pmatrix}
-% 			1 \\ \vec{\ten{F}_y}
-% 		\end{pmatrix} = (\mat{I}_p\otimes \t{(1, \vec{\ten{F}_y})})\vec((\vec{\overline{\ten{\eta}}}, \mat{B}))
-% 	\end{displaymath}
-% 	the natural parameter $\mat{\eta}_y$ with components \eqref{eq:eta1}, \eqref{eq:eta2} can be written with $\mat{\theta} = (\vec{\overline{\ten{\eta}}}, \vec{\mat{B}}, \vec{\mat{\Omega}})$ as
-% 	\begin{displaymath}
-% 		\mat{\eta}_y = \begin{pmatrix}
-% 			\mat{I}_p\otimes \t{(1, \vec{\ten{F}_y})} & 0 \\
-% 			0 & c\t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})})}
-% 		\end{pmatrix}\mat{\theta} = \mat{F}(y)\mat{\theta}
-% 	\end{displaymath}
-% 	Using the new form, let, with $z = (\ten{X}, y)$,
-% 	\begin{displaymath}
-% 		m_{\mat{\theta}}(z) = \langle \mat{F}(y)\mat{\theta}, \mat{t}(\ten{X}) \rangle - b(\mat{F}(y)\mat{\theta})
-% 	\end{displaymath}
-% 	which allows us to write the log-likelihood \eqref{eq:log-likelihood}, scaled by $n^{-1}$, with $Z_i = (\ten{X}_i, Y_i)$, as
-% 	\begin{displaymath}
-% 		M_n(\mat{\theta}) = \frac{1}{n}\sum_{i = 1}^{n} m_{\mat{\theta}}(Z_i) = \frac{1}{n} l_n(\mat{\theta}).
-% 	\end{displaymath}
-
-% 	The strict convexity of $b$ ensures that $\mat{\theta}\mapsto m_{\mat{\theta}}(z)$ is strictly concave for every fixed $z$. Taking expectation preserves the concavity, yielding $M(\mat{\theta}) = \E m_{\mat{\theta}}(Z)$ to be concave as well. The identifiability \todo{???} of \todo{???} allows us to apply \textcite[Thm~5.35]{asymStats-van_der_Vaart1998}, which gives for the family of conditional densities $\{ f_{\mat{\theta}}(\,.\mid Y) : \mat{\theta}\in\Theta \}$ that
-% 	\begin{displaymath}
-% 		\E[m_{\mat{\theta}}(Z)\mid Y] \leq \E[m_{\mat{\theta}_0}(Z)\mid Y].
-% 	\end{displaymath}
-% 	Taking the expectation with respect to $Y$ gives $M(\mat{\theta}) = \E[m_{\mat{\theta}}(Z)] \leq \E[m_{\mat{\theta}_0}(Z)] = M(\mat{\theta}_0)$ for every $\mat{\theta}$. Now, assume there exists a $\mat{\theta}_1$ such that $M(\mat{\theta}_1) = M(\mat{\theta}_0)$, then $P(\mat{F}(Y)\mat{\theta}_1 = \mat{F}(Y)\mat{\theta}_0) = 1$ which means \todo{figure that out why} which contradicts \todo{$P(\mat{F}(Y)\mat{\theta} = \overline{\ten{\eta}}) = 1$ which is a violation of an assumption.} Finally, with the integrability assumption we can apply \textcite[Thm~A.1]{asymptoticMLE-BuraEtAl2018}, which gives that the MLE $\hat{\mat{\theta}}_n$ of \eqref{eq:mle} exists and is unique.\todo{Fix that!!!}
-
-% 	{\color{green}Foo Bar Baz ...}
-
-% 	\todo{finish me!}
-% \end{proof}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Asymptotic Normality}

@@ -814,38 +808,96 @@ The following is a reformulation of \textcite[Lemma~2.3]{asymptoticMLE-BuraEtAl2
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We compare our method with a few other methods for the tensor normal and the Ising model (inverse Ising problem).

-\subsection{Tensor Normal Distribution}
-The \emph{tensor normal distribution} $\mathcal{TN}(\ten{\mu}, \mat{\Sigma}_1, ..., \mat{\Sigma}_r)$ is a generalization of the \emph{matrix normal distribution} $\mathcal{MN}(\mat{\mu}, \mat{\Sigma}_1, \mat{\Sigma}_2)$. The density of the conditional tensor normal distribution $\ten{X}\mid Y = y$ is given by
-\begin{displaymath}
-	f_{\mat{\theta}_y}(\ten{X}\mid Y = y) = (2\pi)^{-p/2}\prod_{k = 1}^r |\mat{\Sigma}_{k}|^{-p / 2 p_{k}}\exp\Big(
-		-\frac{1}{2}\Big\langle \ten{X} - \ten{\mu}_y, (\ten{X} - \ten{\mu}_y)\mlm_{k\in[r]}\mat{\Sigma}_{k} \Big\rangle
-	\Big)
-\end{displaymath}
-
-\subsection{Ising Model}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Tensor Normal}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\begin{itemize}
+	\item[a] A simple setup with linear relation for
+	\item[b]
+	\item[c]
+	\item[d]
+	\item[e] Misspecified model .......
+\end{itemize}

\begin{figure}
	\centering
-	\begin{scaletikzpicturetowidth}{0.5\textwidth}
-		\input{plots/sim-normal.tex}
-	\end{scaletikzpicturetowidth}
-	\caption{\label{fig:sim-normal}As a test case we show the $3$-tensor normal (generalization of the matrix normal)}
+	\includegraphics[width = \textwidth]{plots/sim-normal.pdf}
+	\caption{\label{fig:sim-normal}Tensor normal simulation}
\end{figure}

-\subsection{Matrix Normal and Ising Model}
-
-If $\mat{X} \in \mathbb{R}^{p_1 \times p_2}$, then $\mat{\theta} = (\overline{\eta}, \mat{\beta}_1, \mat{\beta}_2, \mat{\Omega}_1, \mat{\Omega}_2)$ and $\mat{F}_y$ is also matrix valued. The conditional pdf of $\mat{X}\mid Y$ is
-\begin{align*}
-	f_{\mat{\theta}}(\mat{X}\mid Y = y)
-	&= h(\mat{X})\exp(\langle\mat{X}, \mat{\eta}_1(\mat{\theta})\rangle + \langle\mat{X}\circ\mat{X}, \mat{\eta}_2(\mat{\theta})\rangle - b(\mat{\eta}_y(\mat{\theta}))) \\
-	&= h(\mat{X})\exp(\tr((\overline{\mat{\eta}} + \mat{\beta}_1\mat{F}_y\t{\mat{\beta}_2})\t{\mat{X}}) + \tr(\mat{\Omega}_1\mat{X}\mat{\Omega}_2\t{\mat{X}}) - b(\mat{\eta}_y(\mat{\theta}))).
-\end{align*}
-The MLE estimate $\hat{\mat{\theta}}_n = (\widehat{\overline{\mat{\eta}}}, \widehat{\mat{\beta}}_2\otimes \widehat{\mat{\beta}}_1, \widehat{\mat{\Omega}}_2\otimes \widehat{\mat{\Omega}}_1)$ is asymptotically normal and
-\begin{displaymath}
-	\widehat{\ten{R}}(\mat{X}) = \t{(\widehat{\mat{\beta}}_2\otimes \widehat{\mat{\beta}}_1)}\vec(\mat{X} - \E\mat{X}) \equiv \t{\widehat{\mat{\beta}}_1}(\mat{X} - \E\mat{X})\widehat{\mat{\beta}}_2
-\end{displaymath}
-is the MLE of the sufficient reduction $\ten{R}(\mat{X})$ of dimension $q_1 \times q_2 \leq p_1 \times p_2$.\\[1.6em]
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Ising Model}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\begin{figure}
+	\centering
+	\includegraphics[]{plots/sim-ising.pdf}
+	\caption{\label{fig:sim-ising}Ising model simulation}
+\end{figure}
+
+\begin{table}
+	\begin{tabular}{c | ccc ccc c}
+		$n$ & GMLM & PCA & HOPCA & LPCA & CLPCA & TSIR & MGCCA \\
+		\hline
+		100 & {\bf 0.34} (0.14) & 0.90 (0.04) & 0.90 (0.05) & 0.94 (0.09) & 0.91 (0.03) & 0.48 (0.19) & 0.55 (0.13) \\
+		200 & {\bf 0.25} (0.11) & 0.90 (0.03) & 0.90 (0.03) & 0.96 (0.07) & 0.91 (0.02) & 0.38 (0.16) & 0.53 (0.10) \\
+		300 & {\bf 0.20} (0.09) & 0.89 (0.02) & 0.89 (0.02) & 0.97 (0.06) & 0.91 (0.02) & 0.29 (0.13) & 0.51 (0.11) \\
+		500 & {\bf 0.16} (0.07) & 0.90 (0.02) & 0.90 (0.02) & 0.98 (0.01) & 0.91 (0.01) & 0.23 (0.10) & 0.50 (0.08) \\
+		750 & {\bf 0.13} (0.05) & 0.90 (0.01) & 0.90 (0.01) & 0.98 (0.02) & 0.91 (0.01) & 0.23 (0.08) & 0.53 (0.06)
+	\end{tabular}
+	\caption{\label{tab:sim-ising}Ising model simulation}
+\end{table}
+
+% The \emph{tensor normal distribution} $\mathcal{TN}(\ten{\mu}, \mat{\Sigma}_1, ..., \mat{\Sigma}_r)$ is a generalization of the \emph{matrix normal distribution} $\mathcal{MN}(\mat{\mu}, \mat{\Sigma}_1, \mat{\Sigma}_2)$. \todo{ref} The density of the conditional tensor normal distribution $\ten{X}\mid Y = y$ according to the quadratic exponential family \eqref{eq:quadratic-exp-fam}, where only the first moment depends on $y$, is given by
+% \begin{displaymath}
+% 	f_{\mat{\theta}}(\ten{X}\mid Y = y) = (2\pi)^{-p/2}\prod_{k = 1}^r |\mat{\Sigma}_{k}|^{-p / 2 p_{k}}\exp\Big(
+% 		-\frac{1}{2}\Big\langle \ten{X} - \ten{\mu}_y, (\ten{X} - \ten{\mu}_y)\mlm_{k\in[r]}\mat{\Sigma}_{k} \Big\rangle
+% 	\Big)
+% \end{displaymath}
+% Rewriting this in the form of an exponential family as in \eqref{eq:quadratic-exp-fam} allows us to determine the natural parameter components $\mat{\eta}_{y1}$ and $\mat{\eta}_2$. Since a non-degenerate normal distribution requires the covariance matrices $\mat{\Sigma}_k$ to be symmetric positive definite, the relation to the second-moment natural parameter $\mat{\eta}_2$ simplifies, as we can set $\mat{T}_2$ in \eqref{eq:eta2-manifold} to the identity. This then gives the relation to the natural parameters as in \eqref{eq:eta1-manifold} and \eqref{eq:eta2-manifold} as
+% \begin{displaymath}
+% 	\mat{\eta}_{1y} = \vec\Bigl(\ten{\mu}_y\mlm_{k = 1}^{r}\mat{\Sigma}_k\Bigr), \qquad
+% 	\mat{\eta}_2 = c\t{\mat{D}_p}\vec\bigkron_{k = r}^{1}\mat{\Sigma}_k^{-1}
+% \end{displaymath}
+% with scaling constant $c = -1 / 2$. Modeling the natural parameters as in \eqref{eq:eta1} and \eqref{eq:eta2} relates the mean $\ten{\mu}_y$ and the covariance matrices $\mat{\Sigma}_k$ of the tensor normal to the generalized multi-linear model parameters $\overline{\ten{\eta}}$ and $\mat{\beta}_k$, $\mat{\Omega}_k$, for $k = 1, \ldots, r$, through
+% \begin{displaymath}
+% 	\ten{\mu}_y = \Bigl(\overline{\ten{\eta}} + \ten{F}_y\mlm_{j = 1}^{r}\mat{\beta}_j\Bigr)\mlm_{k = 1}^{r}\mat{\Omega}_k^{-1}, \qquad \mat{\Omega}_k = \mat{\Sigma}_k^{-1}.
+% \end{displaymath}
+% This completely determines the tensor normal distribution given the GMLM parameters.
+
+% To estimate the GMLM parameters $\mat{\theta} = (\overline{\ten{\eta}}, \mat{\beta}_1, \ldots, \mat{\beta}_r, \mat{\Omega}_1, \ldots, \mat{\Omega}_r)$ given a data set $(\ten{X}_i, y_i)$ of $i = 1, \ldots, n$ observations, we use the gradients provided by \cref{thm:grad}. It turns out that the equations $\nabla_{\overline{\ten{\eta}}}l_n = 0, \nabla_{\mat{\beta}_j}l_n = 0$ and $\nabla_{\mat{\Omega}_j}l_n = 0$, for $j = 1, \ldots, r$, can be solved for the differentiation variable assuming all the other parameter blocks to be constant. Centering the observed $\ten{X}$ leads to a cleaner formulation given by \todo{fix the following!}
+% \begin{align*}
+% 	\hat{\overline{\ten{\eta}}} &= \frac{1}{n}\sum_{i = 1}^{n} \ten{X}_i, \\
+% 	\t{\hat{\mat{\beta}}_j} &= \mat{\Omega}_j \Bigl(\Bigl(\ten{F}_{y_i}\mlm_{k \neq j}\mat{\Omega}_k^{-1}\mat{\beta}_k\Bigr)_{(j)}\t{\Bigl(\ten{F}_{y_i}\mlm_{k \neq j}\mat{\beta}_k\Bigr)_{(j)}}\Bigr)^{-1}\Bigl(\ten{F}_{y_i}\mlm_{k \neq j}\mat{\beta}_k\Bigr)_{(j)}\ten{X}_{(j)}, \\
+% 	\t{\Omega}_j &= scaling \frac{1}{n}
+% \end{align*}
+
+% This allows the use of a \emph{block coordinate descent} method instead of gradient descent. This method keeps all but one parameter block fixed and optimizes the objective for a single block. Given a closed form solution for the partial gradients, only a single update is required to solve the partial optimization problem, meaning the block coordinate descent method reduces to cyclic updating. This not only converges very fast, it also does not require any hyperparameters.
+
+% For this iterative scheme we do need some initial estimates. For those we
+
+% \subsection{Matrix Normal and Ising Model}
+
+% If $\mat{X} \in \mathbb{R}^{p_1 \times p_2}$, then $\mat{\theta} = (\overline{\eta}, \mat{\beta}_1, \mat{\beta}_2, \mat{\Omega}_1, \mat{\Omega}_2)$ and $\mat{F}_y$ is also matrix valued. The conditional pdf of $\mat{X}\mid Y$ is
+% \begin{align*}
+% 	f_{\mat{\theta}}(\mat{X}\mid Y = y)
+% 	&= h(\mat{X})\exp(\langle\mat{X}, \mat{\eta}_1(\mat{\theta})\rangle + \langle\mat{X}\circ\mat{X}, \mat{\eta}_2(\mat{\theta})\rangle - b(\mat{\eta}_y(\mat{\theta}))) \\
+% 	&= h(\mat{X})\exp(\tr((\overline{\mat{\eta}} + \mat{\beta}_1\mat{F}_y\t{\mat{\beta}_2})\t{\mat{X}}) + \tr(\mat{\Omega}_1\mat{X}\mat{\Omega}_2\t{\mat{X}}) - b(\mat{\eta}_y(\mat{\theta}))).
+% \end{align*}
+% The MLE estimate $\hat{\mat{\theta}}_n = (\widehat{\overline{\mat{\eta}}}, \widehat{\mat{\beta}}_2\otimes \widehat{\mat{\beta}}_1, \widehat{\mat{\Omega}}_2\otimes \widehat{\mat{\Omega}}_1)$ is asymptotically normal and
+% \begin{displaymath}
+% 	\widehat{\ten{R}}(\mat{X}) = \t{(\widehat{\mat{\beta}}_2\otimes \widehat{\mat{\beta}}_1)}\vec(\mat{X} - \E\mat{X}) \equiv \t{\widehat{\mat{\beta}}_1}(\mat{X} - \E\mat{X})\widehat{\mat{\beta}}_2
+% \end{displaymath}
+% is the MLE of the sufficient reduction $\ten{R}(\mat{X})$ of dimension $q_1 \times q_2 \leq p_1 \times p_2$.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\appendix

@@ -0,0 +1,151 @@ (new file: Ising simulation plot)
%%% R code to generate the input data files from corresponding simulation logs
% R> setwd("~/Work/tensorPredictors")
% R>
% R> for (sim.name in c("2a")) {
% R>     pattern <- paste0("sim\\_", sim.name, "\\_ising\\-[0-9T]*\\.csv")
% R>     log.file <- sort(
% R>         list.files(path = "sim/", pattern = pattern, full.names = TRUE),
% R>         decreasing = TRUE
% R>     )[[1]]
% R>
% R>     sim <- read.csv(log.file)
% R>
% R>     aggr <- aggregate(sim[, names(sim) != "sample.size"], list(sample.size = sim$sample.size), mean)
% R>
% R>     write.table(aggr, file = paste0("LaTeX/plots/aggr-", sim.name, "-ising.csv"), row.names = FALSE, quote = FALSE)
% R> }
\documentclass[border=0cm]{standalone}

\usepackage{tikz}
\usepackage{pgfplots}
\usepackage{bm}

\definecolor{gmlm}{RGB}{0,0,0}
\definecolor{mgcca}{RGB}{86,180,233}
\definecolor{tsir}{RGB}{0,158,115}
\definecolor{pca}{RGB}{240,228,66}
\definecolor{hopca}{RGB}{230,159,0}
\definecolor{lpca}{RGB}{127,127,127}
\definecolor{clpca}{RGB}{191,191,191}

\pgfplotsset{
    every axis/.style={
        xtick={100,200,300,500,750},
        ymin=-0.05, ymax=1.05,
        grid=both,
        grid style={gray, dotted}
    },
    every axis plot/.append style={
        mark = *,
        mark size = 1pt,
        line width=0.8pt
    }
}
\tikzset{
    legend entry/.style={
        mark = *,
        mark size = 1pt,
        mark indices = {2},
        line width=0.8pt
    }
}

\begin{document}
\begin{tikzpicture}[>=latex]

\begin{axis}[
    name=sim-2a
]
    \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-2a-ising.csv};
    \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-2a-ising.csv};
    \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-2a-ising.csv};
    \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-2a-ising.csv};
    \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-2a-ising.csv};
    \addplot[color = lpca] table[x = sample.size, y = dist.subspace.lpca] {aggr-2a-ising.csv};
    \addplot[color = clpca] table[x = sample.size, y = dist.subspace.clpca] {aggr-2a-ising.csv};
\end{axis}
\node[anchor = base west, yshift = 0.3em] at (sim-2a.north west) {
    a: small
};

% \begin{axis}[
%     name=sim-1b,
%     anchor=north west, at={(sim-2a.right of north east)}, xshift=0.1cm,
%     xticklabel=\empty,
%     ylabel near ticks, yticklabel pos=right
% ]
%     \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1b-normal.csv};
%     \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1b-normal.csv};
%     \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1b-normal.csv};
%     \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1b-normal.csv};
%     \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1b-normal.csv};
% \end{axis}
% \node[anchor = base west, yshift = 0.3em] at (sim-1b.north west) {
%     b: cubic dependence on $y$
% };

% \begin{axis}[
%     name=sim-1c,
%     anchor=north west, at={(sim-2a.below south west)}, yshift=-.8em,
%     xticklabel=\empty
% ]
%     \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1c-normal.csv};
%     \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1c-normal.csv};
%     \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1c-normal.csv};
%     \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1c-normal.csv};
%     \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1c-normal.csv};
% \end{axis}
% \node[anchor = base west, yshift = 0.3em] at (sim-1c.north west) {
%     c: rank $1$ $\boldsymbol{\beta}$'s
% };

% \begin{axis}[
%     name=sim-1d,
%     anchor=north west, at={(sim-1c.right of north east)}, xshift=0.1cm,
%     ylabel near ticks, yticklabel pos=right
% ]
%     \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1d-normal.csv};
%     \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1d-normal.csv};
%     \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1d-normal.csv};
%     \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1d-normal.csv};
%     \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1d-normal.csv};
% \end{axis}
% \node[anchor = base west, yshift = 0.3em] at (sim-1d.north west) {
%     d: tri-diagonal $\boldsymbol{\Omega}$'s
% };

% \begin{axis}[
%     name=sim-1e,
%     anchor=north west, at={(sim-1c.below south west)}, yshift=-.8em
% ]
%     \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1e-normal.csv};
%     \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1e-normal.csv};
%     \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1e-normal.csv};
%     \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1e-normal.csv};
%     \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1e-normal.csv};
% \end{axis}
% \node[anchor = base west, yshift = 0.3em] at (sim-1e.north west) {
%     e: misspecified
% };

\matrix[anchor = west] at (sim-2a.right of east) {
    \draw[color=gmlm, legend entry, line width=1pt] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {GMLM}; \\
    \draw[color=tsir, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {TSIR}; \\
    \draw[color=mgcca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {MGCCA}; \\
    \draw[color=hopca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {HOPCA}; \\
    \draw[color=pca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {PCA}; \\
    \draw[color=lpca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {LPCA}; \\
    \draw[color=clpca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {CLPCA}; \\
};

\node[anchor = north] at (current bounding box.south) {Sample Size $n$};

\node[anchor = south, rotate = 90] at (current bounding box.west) {Subspace Distance $d(\boldsymbol{B}, \hat{\boldsymbol{B}})$};
\node[anchor = south, rotate = 270] at (current bounding box.east) {\phantom{Subspace Distance $d(\boldsymbol{B}, \hat{\boldsymbol{B}})$}};

\node[anchor = south, font=\large] at (current bounding box.north) {Ising Model Simulation};

\end{tikzpicture}
\end{document}
@@ -0,0 +1,148 @@ (new file: tensor normal simulation plot)
%%% R code to generate the input data files from corresponding simulation logs
% R> setwd("~/Work/tensorPredictors")
% R>
% R> for (sim.name in c("1a", "1b", "1c", "1d", "1e")) {
% R>     pattern <- paste0("sim\\_", sim.name, "\\_normal\\-[0-9T]*\\.csv")
% R>     log.file <- sort(
% R>         list.files(path = "sim/", pattern = pattern, full.names = TRUE),
% R>         decreasing = TRUE
% R>     )[[1]]
% R>
% R>     sim <- read.csv(log.file)
% R>
% R>     aggr <- aggregate(sim[, names(sim) != "sample.size"], list(sample.size = sim$sample.size), mean)
% R>
% R>     write.table(aggr, file = paste0("LaTeX/plots/aggr-", sim.name, "-normal.csv"), row.names = FALSE, quote = FALSE)
% R> }
\documentclass[border=0cm]{standalone}

\usepackage{tikz}
\usepackage{pgfplots}
\usepackage{bm}

\definecolor{gmlm}{RGB}{0,0,0}
\definecolor{mgcca}{RGB}{86,180,233}
\definecolor{tsir}{RGB}{0,158,115}
\definecolor{hopca}{RGB}{230,159,0}
\definecolor{pca}{RGB}{240,228,66}
\definecolor{lpca}{RGB}{0,114,178}
\definecolor{clpca}{RGB}{213,94,0}

\pgfplotsset{
    every axis/.style={
        xtick={100,200,300,500,750},
        ymin=-0.05, ymax=1.05,
        grid=both,
        grid style={gray, dotted}
    },
    every axis plot/.append style={
        mark = *,
        mark size = 1pt,
        line width=0.8pt
    }
}
\tikzset{
    legend entry/.style={
        mark = *,
        mark size = 1pt,
        mark indices = {2},
        line width=0.8pt
    }
}

\begin{document}
\begin{tikzpicture}[>=latex]

\begin{axis}[
    name=sim-1a,
    xticklabel=\empty
]
    \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1a-normal.csv};
    \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1a-normal.csv};
    \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1a-normal.csv};
    \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1a-normal.csv};
    \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1a-normal.csv};
\end{axis}
\node[anchor = base west, yshift = 0.3em] at (sim-1a.north west) {
    a: linear dependence on $\mathcal{F}_y \equiv y$
};

\begin{axis}[
    name=sim-1b,
    anchor=north west, at={(sim-1a.right of north east)}, xshift=0.1cm,
    xticklabel=\empty,
    ylabel near ticks, yticklabel pos=right
]
    \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1b-normal.csv};
    \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1b-normal.csv};
    \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1b-normal.csv};
    \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1b-normal.csv};
    \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1b-normal.csv};
\end{axis}
\node[anchor = base west, yshift = 0.3em] at (sim-1b.north west) {
    b: cubic dependence on $y$
};

\begin{axis}[
    name=sim-1c,
    anchor=north west, at={(sim-1a.below south west)}, yshift=-.8em,
    xticklabel=\empty
]
    \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1c-normal.csv};
    \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1c-normal.csv};
    \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1c-normal.csv};
    \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1c-normal.csv};
    \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1c-normal.csv};
\end{axis}
\node[anchor = base west, yshift = 0.3em] at (sim-1c.north west) {
    c: rank $1$ $\boldsymbol{\beta}$'s
};

\begin{axis}[
    name=sim-1d,
    anchor=north west, at={(sim-1c.right of north east)}, xshift=0.1cm,
    ylabel near ticks, yticklabel pos=right
]
    \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1d-normal.csv};
    \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1d-normal.csv};
    \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1d-normal.csv};
    \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1d-normal.csv};
    \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1d-normal.csv};
\end{axis}
\node[anchor = base west, yshift = 0.3em] at (sim-1d.north west) {
    d: tri-diagonal $\boldsymbol{\Omega}$'s
};

\begin{axis}[
    name=sim-1e,
    anchor=north west, at={(sim-1c.below south west)}, yshift=-.8em
]
    \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1e-normal.csv};
    \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1e-normal.csv};
    \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1e-normal.csv};
    \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1e-normal.csv};
    \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1e-normal.csv};
\end{axis}
\node[anchor = base west, yshift = 0.3em] at (sim-1e.north west) {
    e: misspecified
};

\matrix[anchor = center] at (sim-1e.right of east -| sim-1d.south) {
    \draw[color=gmlm, legend entry, line width=1pt] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {GMLM}; \\
    \draw[color=tsir, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {TSIR}; \\
    \draw[color=mgcca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {MGCCA}; \\
    \draw[color=hopca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {HOPCA}; \\
    \draw[color=pca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {PCA}; \\
};

\node[anchor = north] at (current bounding box.south) {Sample Size $n$};

\node[anchor = south, rotate = 90] at (current bounding box.west) {Subspace Distance $d(\boldsymbol{B}, \hat{\boldsymbol{B}})$};
\node[anchor = south, rotate = 270] at (current bounding box.east) {\phantom{Subspace Distance $d(\boldsymbol{B}, \hat{\boldsymbol{B}})$}};

\node[anchor = south, font=\large] at (current bounding box.north) {Tensor Normal Simulation};

\end{tikzpicture}
\end{document}
@@ -11,8 +11,8 @@ export(.projRank)
export(.projSymBand)
export(.projSymRank)
export(CISE)
-export(D)
+export(Dup)
-export(D.pinv)
+export(Dup.pinv)
export(GMLM)
export(GMLM.default)
export(HOPCA)

@@ -32,6 +32,7 @@ export(RMReg)
export(RMap)
export(S)
export(TSIR)
+export(approx.kron)
export(approx.kronecker)
export(colKronecker)
export(decompose.kronecker)

@@ -41,6 +42,7 @@ export(dist.projection)
export(dist.subspace)
export(exprs.all.equal)
export(gmlm_ising)
+export(gmlm_mlm)
export(gmlm_tensor_normal)
export(icu_tensor_normal)
export(ising_m2)

@@ -72,6 +74,7 @@ export(reduce)
export(riccati)
export(rowKronecker)
export(rtensornorm)
+export(slice.assign.expr)
export(slice.expr)
export(slice.select)
export(sylvester)

@@ -1,3 +1,31 @@
+#' @examples
+#' nrows <- c(3, 2, 5)
+#' ncols <- c(2, 4, 8)
+#'
+#' A <- Reduce(kronecker, Map(matrix, Map(rnorm, nrows * ncols), nrows))
+#'
+#' all.equal(A, Reduce(kronecker, approx.kron(A, nrows, ncols)))
+#'
+#' @export
+approx.kron <- function(A, nrows, ncols) {
+
+    # rearrange the Kronecker product `A` into the outer product `R`
+    dim(A) <- c(rev(nrows), rev(ncols))
+    axis.perm <- as.vector(t(matrix(seq_along(dim(A)), ncol = 2L))[, rev(seq_along(nrows))])
+    R <- aperm(A, axis.perm, resize = FALSE)
+    dim(R) <- nrows * ncols
+
+    # scaling factor for every product component
+    s <- sum(A^2)^(1 / length(dim(A)))
+
+    # higher-order SVD on `R`, keeping one singular vector per mode
+    Map(function(mode, nr, nc) {
+        s * `dim<-`(La.svd(mcrossprod(R, mode = mode), 1L, 0L)$u, c(nr, nc))
+    }, seq_along(nrows), nrows, ncols)
+}
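A two-factor usage sketch (dimensions illustrative), mirroring the roxygen example above; for an exact Kronecker product the rearranged `R` is rank one in every mode, so the reconstructed product should reproduce the input, while the individual factors are determined only up to scale and paired signs:

    A1 <- matrix(rnorm(3 * 2), 3, 2)
    A2 <- matrix(rnorm(4 * 5), 4, 5)
    fit <- approx.kron(kronecker(A1, A2), nrows = c(3, 4), ncols = c(2, 5))
    all.equal(kronecker(A1, A2), Reduce(kronecker, fit))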

#' Approximates Kronecker Product decomposition.
#'
#' Approximates the matrices `A` and `B` such that

@ -1,42 +1,110 @@
|
||||||
#' Specialized version of the GMLM for the Ising model (inverse Ising problem)
|
#' Specialized version of the GMLM for the Ising model (inverse Ising problem)
|
||||||
#'
|
#'
|
||||||
|
#' @todo TODO: Add beta and Omega projections
|
||||||
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
gmlm_ising <- function(X, F, sample.axis = length(dim(X)),
|
gmlm_ising <- function(X, F, sample.axis = length(dim(X)),
|
||||||
|
# proj.betas = ..., proj.Omegas = ..., # TODO: this
|
||||||
max.iter = 1000L,
|
max.iter = 1000L,
|
||||||
eps = sqrt(.Machine$double.eps),
|
eps = sqrt(.Machine$double.eps),
|
||||||
step.size = function(iter) 1e-2 * (1000 / (iter + 1000)),
|
step.size = 1e-3,
|
||||||
zig.zag.threashold = 5L,
|
zig.zag.threashold = 20L,
|
||||||
patience = 5L,
|
patience = 3L,
|
||||||
nr.slices = 10L, # only for univariate `F(y) = y`
|
nr.slices = 20L, # only for univariate `F(y) = y`
|
||||||
slice.method = c("cut", "ecdf"), # only for univariate `F(y) = y` and `y` is a factor or integer
|
slice.method = c("cut", "ecdf", "none"), # only for univariate `F(y) = y` and `y` is a factor or integer
|
||||||
logger = function(...) { }
|
logger = function(...) { }
|
||||||
) {
|
) {
|
    # Get problem dimensions
    dimX <- dim(X)[-sample.axis]
    sample.size <- dim(X)[sample.axis]
    # treat scalar `F` as a tensor
    if (is.null(dim(F))) {
        dimF <- rep(1L, length(dimX))
        dim(F) <- ifelse(seq_along(dim(X)) == sample.axis, sample.size, 1L)
    } else {
        dimF <- dim(F)[-sample.axis]
    }

    # rearrange `X`, `F` such that the last axis enumerates observations
    if (sample.axis != length(dim(X))) {
        axis.perm <- c(seq_along(dim(X))[-sample.axis], sample.axis)
        X <- aperm(X, axis.perm)
        F <- aperm(F, axis.perm)
        sample.axis <- length(dim(X))
    }
    modes <- seq_along(dimX)

    # Special case for univariate response `vec F(y) = y`.
    # Due to high computational costs we use slicing.
    slice.method <- match.arg(slice.method)
    slices.ind <- if ((slice.method != "none") && (length(F) == sample.size)) {
        # univariate response: one scalar `F` entry per observation
        y <- as.vector(F)
        if (!(is.factor(y) || is.integer(y))) {
            if (slice.method == "ecdf") {
                y <- cut(ecdf(y)(y), nr.slices)
            } else {
                y <- cut(y, nr.slices)
            }
        }
        split(seq_len(sample.size), y, drop = TRUE)
    } else {
        seq_len(sample.size)
    }
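
    ## Minimal, self-contained illustration of the slicing above (hypothetical
    ## data): `cut(ecdf(y)(y), k)` yields `k` roughly equally sized slices of
    ## observation indices.
    # y <- rnorm(10)
    # str(split(seq_along(y), cut(ecdf(y)(y), 3L)))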

    # initialize betas with tensor normal estimate (ignoring data being binary)
    fit_normal <- gmlm_tensor_normal(X, F, sample.axis = length(dim(X)))
    betas <- fit_normal$betas

    # initialize Omegas from empirical log odds ratios
    Omegas <- Omegas.init <- Map(function(mode) {
        n <- prod(dim(X)[-mode])
        prob2 <- mcrossprod(X, mode = mode) / n    # pairwise "one-one" frequencies
        prob2[prob2 == 0] <- 1 / n                 # avoid log(0) for unobserved pairs
        prob1 <- diag(prob2)                       # marginal "one" frequencies
        `prob1^2` <- outer(prob1, prob1)
        # log odds ratios with a zeroed diagonal
        `diag<-`(log(((1 - `prob1^2`) / `prob1^2`) * prob2 / (1 - prob2)), 0)
    }, modes)
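
    ## Sanity check of the log odds ratio above for one pair of binary
    ## variables (hypothetical probabilities): with marginals `p1`, `p1.` and
    ## joint `p2`, independence (`p2 == p1 * p1.`) gives exactly 0.
    # p1 <- 0.3; p1. <- 0.4; p2 <- 0.2
    # log(((1 - p1 * p1.) / (p1 * p1.)) * p2 / (1 - p2))    # > 0, co-occurrence
    # p2 <- p1 * p1.
    # log(((1 - p1 * p1.) / (p1 * p1.)) * p2 / (1 - p2))    # == 0, independence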

    # Determine degenerate combinations, that is, variables which are mutually
    # exclusive in the data set
    matX <- mat(X, sample.axis)
    degen <- crossprod(matX) == 0
    degen.mask <- which(degen)
    # If there are degenerate combinations, compute an (arbitrary) bound for
    # the log odds parameters of those combinations
    if (length(degen.mask)) {
        degen.ind <- arrayInd(degen.mask, dim(degen))
        meanX <- colMeans(matX)
        prodX <- meanX[degen.ind[, 1]] * meanX[degen.ind[, 2]]
        degen.bounds <- log((1 - prodX) / (prodX * sample.size))
        # Component indices in Omegas of degenerate two-way interactions
        degen.ind <- arrayInd(degen.mask, rep(dimX, 2))
        degen.ind <- Map(function(d, m) {
            degen.ind[, m] + dimX[m] * (degen.ind[, m + length(dimX)] - 1L)
        }, dimX, seq_along(dimX))

        ## Enforce initial value degeneracy interaction param. constraints
        # Extract parameters corresponding to degenerate interactions
        degen.params <- do.call(rbind, Map(`[`, Omegas, degen.ind))
        # Degeneracy Constrained Parameters (sign is dropped)
        DCP <- mapply(function(vals, bound) {
            logVals <- log(abs(vals))
            err <- max(0, sum(logVals) - log(abs(bound)))
            exp(logVals - (err / length(vals)))
        }, split(degen.params, col(degen.params)), degen.bounds)
        # Update values in Omegas such that all degeneracy constraints hold
        Omegas <- Map(function(Omega, cp, ind) {
            # Combine multiple constraints for every element into a single
            # constraint value per element
            cp <- mapply(min, split(abs(cp), ind))
            ind <- as.integer(names(cp))
            `[<-`(Omega, ind, sign(Omega[ind]) * cp)
        }, Omegas, split(DCP, row(DCP)), degen.ind)
    }
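
    ## Tiny illustration of a "degenerate" pair (hypothetical data): variables
    ## 1 and 2 are never jointly active, so their interaction parameter is not
    ## identifiable from the likelihood and is only kept bounded.
    # matX <- matrix(c(1, 0,  0, 1,  1, 0,  0, 0), ncol = 2, byrow = TRUE)
    # crossprod(matX)    # zero off-diagonal entries mark degenerate pairs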

    # Initialize mean squared gradients
    grad2_betas <- Map(array, 0, Map(dim, betas))
    grad2_Omegas <- Map(array, 0, Map(dim, Omegas))

@@ -49,12 +117,8 @@ gmlm_ising <- function(X, F, sample.axis = length(dim(X)),
    non_improving <- 0L

    # technical access points to dynamically access a multi-dimensional array
    `X[..., i]` <- slice.expr(X, sample.axis, index = i)
    `F[..., i]` <- slice.expr(F, sample.axis, index = i, drop = FALSE)

    # Iterate till a break condition triggers or till max. nr. of iterations
    for (iter in seq_len(max.iter)) {
@@ -62,31 +126,36 @@ gmlm_ising <- function(X, F, sample.axis = length(dim(X)),
        grad_betas <- Map(matrix, 0, dimX, dimF)
        Omega <- Reduce(kronecker, rev(Omegas))

        # second order residuals accumulator
        # `sum_i (X_i o X_i - E[X o X | Y = y_i])`
        R2 <- array(0, dim = c(dimX, dimX))

        # negative log-likelihood
        loss <- 0

        for (i in slices.ind) {
            # slice size (nr. of objects in the slice)
            n_i <- length(i)

            sumF_i <- rowSums(eval(`F[..., i]`), dims = length(dimF))

            # within a slice, the natural parameters are approximated through
            # the slice mean of `F`
            diag_params_i <- mlm(sumF_i / n_i, betas)
            params_i <- Omega + diag(as.vector(diag_params_i))
            m2_i <- ising_m2(params_i)

            # accumulate loss
            matX_i <- mat(eval(`X[..., i]`), modes)
            loss <- loss - (
                sum(matX_i * (params_i %*% matX_i)) + n_i * log(attr(m2_i, "prob_0"))
            )

            R2_i <- tcrossprod(matX_i) - n_i * m2_i
            R1_i <- diag(R2_i)
            dim(R1_i) <- dimX

            for (j in modes) {
                grad_betas[[j]] <- grad_betas[[j]] +
                    mcrossprod(R1_i, mlm(sumF_i, betas[-j], modes[-j]), j)
            }
            R2 <- R2 + as.vector(R2_i)
        }

@@ -98,17 +167,14 @@ gmlm_ising <- function(X, F, sample.axis = length(dim(X)),
        }, modes)

        # update optimization behavioral trackers
        accum_sign <- sign(last_loss - loss) - accum_sign
        non_improving <- max(0L, non_improving - 1L + 2L * (last_loss < loss))

        # check break conditions
        if (abs(accum_sign) > zig.zag.threashold) { break }
        if (non_improving > patience) { break }
        if (abs(last_loss - loss) < eps * last_loss) { break }

        # store current loss for the next iteration
        last_loss <- loss

@@ -122,17 +188,219 @@ gmlm_ising <- function(X, F, sample.axis = length(dim(X)),
        # logging (before parameter update)
        logger(iter, loss, betas, Omegas, grad_betas, grad_Omegas)

        # Update Parameters (gradients scaled by the root mean squared
        # gradients, RMSprop style)
        betas <- Map(function(beta, grad, m2) {
            beta + (step.size / (sqrt(m2) + eps)) * grad
        }, betas, grad_betas, grad2_betas)
        Omegas <- Map(function(Omega, grad, m2) {
            Omega + (step.size / (sqrt(m2) + eps)) * grad
        }, Omegas, grad_Omegas, grad2_Omegas)
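
        ## Scalar sketch of the scaling above: parameters with large recent
        ## (mean squared) gradients `m2` take proportionally smaller steps.
        # m2 <- c(1e-6, 1e-2, 1)
        # 1e-3 / (sqrt(m2) + sqrt(.Machine$double.eps))    # effective steps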

        # Enforce degeneracy parameter constraints
        if (length(degen.mask)) {
            # Extract parameters corresponding to degenerate interactions
            degen.params <- do.call(rbind, Map(`[`, Omegas, degen.ind))
            # Degeneracy Constrained Parameters (sign is dropped)
            DCP <- mapply(function(vals, bound) {
                logVals <- log(abs(vals))
                err <- max(0, sum(logVals) - log(abs(bound)))
                exp(logVals - (err / length(vals)))
            }, split(degen.params, col(degen.params)), degen.bounds)
            # Update values in Omegas such that all degeneracy constraints hold
            Omegas <- Map(function(Omega, cp, ind) {
                # Combine multiple constraints for every element into a single
                # constraint value per element
                cp <- mapply(min, split(abs(cp), ind))
                ind <- as.integer(names(cp))
                `[<-`(Omega, ind, sign(Omega[ind]) * cp)
            }, Omegas, split(DCP, row(DCP)), degen.ind)
        }
    }

    structure(
        list(eta1 = array(0, dimX), betas = betas, Omegas = Omegas),
        tensor_normal = fit_normal,
        Omegas.init = Omegas.init,
        degen.mask = degen.mask
    )
}

################################################################################
### Development Interactive Block (Delete / Make sim / TODO: ...) ###
################################################################################
if (FALSE) { # interactive()

    par(bg = "#1d1d1d",
        fg = "lightgray",
        col = "#d5d5d5",
        col.axis = "#d5d5d5",
        col.lab = "#d5d5d5",
        col.main = "#d5d5d5",
        col.sub = "#d5d5d5", # col.sub = "#2467d0"
        pch = 16
    )
    cex <- 1.25
    col <- colorRampPalette(c("#f15050", "#1d1d1d", "#567DCA"))(256)

    .logger <- function() {
        iter <- 0L
        assign("log", data.frame(
            iter = rep(NA_integer_, 100000),
            loss = rep(NA_real_, 100000),
            dist.B = rep(NA_real_, 100000),
            dist.Omega = rep(NA_real_, 100000),
            norm.grad.B = rep(NA_real_, 100000),
            norm.grad.Omega = rep(NA_real_, 100000)
        ), envir = .GlobalEnv)
        assign("B.gmlm", NULL, .GlobalEnv)
        assign("Omega.gmlm", NULL, .GlobalEnv)

        function(it, loss, betas, Omegas, grad_betas, grad_Omegas) {
            # Store in global namespace (allows to stop and get the results)
            B.gmlm <- Reduce(kronecker, rev(betas))
            assign("B.gmlm", B.gmlm, .GlobalEnv)
            Omega.gmlm <- Reduce(kronecker, rev(Omegas))
            assign("Omega.gmlm", Omega.gmlm, .GlobalEnv)

            dist.B <- dist.subspace(B.true, B.gmlm, normalize = TRUE)
            dist.Omega <- norm(Omega.true - Omega.gmlm, "F")
            norm.grad.B <- sqrt(sum(mapply(norm, grad_betas, "F")^2))
            norm.grad.Omega <- sqrt(sum(mapply(norm, grad_Omegas, "F")^2))

            log[iter <<- iter + 1L, ] <<- list(
                it, loss, dist.B, dist.Omega, norm.grad.B, norm.grad.Omega
            )
            cat(sprintf("\r%3d - d(B): %.3f, d(O): %.3f, |g(B)|: %.3f, |g(O)|: %.3f, loss: %.3f\033[K",
                it, dist.B, dist.Omega, norm.grad.B, norm.grad.Omega, loss))
        }
    }

    sample.size <- 1000
    dimX <- c(2, 3)              # predictor `X` dimension
    dimF <- rep(1, length(dimX)) # dimension of the "function" `F(y)` of the response `y`

    betas <- Map(diag, 1, dimX, dimF)
    Omegas <- list(toeplitz(c(0, -2)), toeplitz(seq(1, 0, by = -0.5)))

    B.true <- Reduce(kronecker, rev(betas))
    Omega.true <- Reduce(kronecker, rev(Omegas))

    # data sampling routine
    c(X, F, y, sample.axis) %<-% (sample.data <- function(sample.size, betas, Omegas) {
        dimX <- mapply(nrow, betas)
        dimF <- mapply(ncol, betas)

        # generate response (sample axis is last axis)
        y <- runif(prod(sample.size, dimF), -2, 2) # ~ U[-2, 2]
        F <- array(y, dim = c(dimF, sample.size))

        Omega <- Reduce(kronecker, rev(Omegas))

        X <- apply(F, length(dim(F)), function(Fi) {
            dim(Fi) <- dimF
            params <- diag(as.vector(mlm(Fi, betas))) + Omega
            tensorPredictors::ising_sample(1, params)
        })
        dim(X) <- c(dimX, sample.size)

        list(X = X, F = F, y = y, sample.axis = length(dim(X)))
    })(sample.size, betas, Omegas)

    local({
        X.proto <- array(seq_len(prod(dimX)), dimX)
        interactions <- crossprod(mat(X, sample.axis))
        dimnames(interactions) <- rep(list(
            do.call(paste0, c("X", Map(slice.index, list(X.proto), seq_along(dimX))))
        ), 2)
        cat("Sample Size: ", sample.size, "\n")
        print.table(interactions, zero.print = ".")
    })

    # system.time({
    #     fit.gmlm <- gmlm_ising(X, y, logger = .logger())
    # })
    Rprof()
    fit.gmlm <- gmlm_ising(X, y)
    Rprof(NULL)
    summaryRprof()

    B.gmlm <- Reduce(kronecker, rev(fit.gmlm$betas))
    Omega.gmlm <- Reduce(kronecker, rev(fit.gmlm$Omegas))

    B.normal <- Reduce(kronecker, rev(attr(fit.gmlm, "tensor_normal")$betas))
    Omega.init <- Reduce(kronecker, rev(attr(fit.gmlm, "Omegas.init")))
    degen.mask <- attr(fit.gmlm, "degen.mask")

    local({
        layout(matrix(c(
            1, 2, 3, 3, 3,
            1, 4, 5, 6, 7
        ), nrow = 2, byrow = TRUE), widths = c(6, 3, 1, 1, 1))

        with(na.omit(log), {
            plot(range(iter), c(0, 1), type = "n", bty = "n",
                xlab = "Iterations", ylab = "Distance")

            lines(iter, dist.B, col = "red", lwd = 2)
            lines(iter, dist.Omega / max(dist.Omega), col = "blue", lwd = 2)
            lines(iter, (loss - min(loss)) / diff(range(loss)), col = "darkgreen", lwd = 2)

            norm.grad <- sqrt(norm.grad.B^2 + norm.grad.Omega^2)
            # Scale all gradient norms
            norm.grad.B <- norm.grad.B / max(norm.grad)
            norm.grad.Omega <- norm.grad.Omega / max(norm.grad)
            norm.grad <- norm.grad / max(norm.grad)
            lines(iter, norm.grad.B, lty = 2, col = "red")
            lines(iter, norm.grad.Omega, lty = 2, col = "blue")
            lines(iter, norm.grad, lty = 2, col = "darkgreen")

            axis(4, at = c(
                tail(dist.B, 1),
                min(dist.B)
            ), labels = round(c(
                tail(dist.B, 1),
                min(dist.B)
            ), 2), col = NA, col.ticks = "red", las = 1)
            axis(4, at = c(
                1,
                tail(dist.Omega, 1) / max(dist.Omega),
                min(dist.Omega) / max(dist.Omega)
            ), labels = round(c(
                max(dist.Omega),
                tail(dist.Omega, 1),
                min(dist.Omega)
            ), 2), col = NA, col.ticks = "blue", las = 1)

            abline(h = c(tail(dist.B, 1), min(dist.B)),
                lty = "dotted", col = "red")
            abline(h = c(max(dist.Omega), tail(dist.Omega, 1), min(dist.Omega)) / max(dist.Omega),
                lty = "dotted", col = "blue")
        })
        legend("topright", col = c("red", "blue", "darkgreen"), lty = 1, lwd = 2,
            legend = c("dist.B", "dist.Omega", "loss"), bty = "n")

        zlim <- max(abs(range(Omega.true, Omega.init, Omega.gmlm))) * c(-1, 1)
        matrixImage(Omega.true, main = "true", zlim = zlim, add.values = TRUE, col = col, cex = cex)
        matrixImage(round(Omega.init, 2), main = "init (cond. prob.)", zlim = zlim, add.values = TRUE, col = col, cex = cex)
        mtext(round(norm(Omega.true - Omega.init, "F"), 3), 3)
        matrixImage(round(Omega.gmlm, 2), main = "gmlm (ising)", zlim = zlim, add.values = TRUE, col = col, cex = cex,
            col.values = c(par("col"), "red")[`[<-`(array(1, rep(prod(dim(X)[-sample.axis]), 2)), degen.mask, 2)])
        mtext(round(norm(Omega.true - Omega.gmlm, "F"), 3), 3)

        zlim <- max(abs(range(B.true, B.normal, B.gmlm))) * c(-1, 1)
        matrixImage(B.true, main = "true",
            zlim = zlim, add.values = TRUE, col = col, cex = cex)
        matrixImage(round(B.normal, 2), main = "init (normal)",
            zlim = zlim, add.values = TRUE, axes = FALSE, col = col, cex = cex)
        mtext(round(dist.subspace(B.true, B.normal, normalize = TRUE), 3), 3)
        matrixImage(round(B.gmlm, 2), main = "gmlm (ising)",
            zlim = zlim, add.values = TRUE, axes = FALSE, col = col, cex = cex)
        mtext(round(dist.subspace(B.true, B.gmlm, normalize = TRUE), 3), 3)
    })
}

@@ -1,109 +1,10 @@
#' Specialized version of GMLM for the tensor normal model
#'
#' The underlying algorithm is an ``iterative (block) coordinate descent'' method
#'
#' @export
gmlm_tensor_normal <- function(X, F, sample.axis = length(dim(X)),
    max.iter = 100L, proj.betas = NULL, proj.Omegas = NULL, logger = NULL,
    cond.threshold = 25, eps = 1e-6
) {
    # rearrange `X`, `F` such that the last axis enumerates observations

@@ -152,7 +53,7 @@ gmlm_tensor_normal <- function(X, F, sample.axis = length(dim(X)),
    }, mcov(F, sample.axis, center = FALSE))

    # initialization of betas ``covariance direction mappings``
    betas <- betas.init <- Map(function(dX, dF) {
        s <- min(ncol(dX), nrow(dF))
        crossprod(dX[1:s, , drop = FALSE], dF[1:s, , drop = FALSE])
    }, dirsX, dirsF)

@@ -226,132 +127,8 @@ gmlm_tensor_normal <- function(X, F, sample.axis = length(dim(X)),
    }

    structure(
        list(eta1 = mlm(meanX, Sigmas), betas = betas, Omegas = Omegas),
        betas.init = betas.init
    )
}
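
## Sketch: the initial `betas` can be compared against the final estimate via
## the `betas.init` attribute returned above (hypothetical fit objects):
# fit <- gmlm_tensor_normal(X, F)
# mapply(function(b0, b1) norm(b0 - b1, "F"), attr(fit, "betas.init"), fit$betas)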

@@ -19,6 +19,7 @@
matrixImage <- function(A, add.values = FALSE,
    main = NULL, sub = NULL, interpolate = FALSE, ..., zlim = NA,
    axes = TRUE, asp = 1, col = hcl.colors(24, "Blue-Red 3", rev = FALSE),
    col.values = par("col"), cex = 1,
    digits = getOption("digits"), new.plot = TRUE
) {
    # plot raster image

@@ -56,6 +57,7 @@ matrixImage <- function(A, add.values = FALSE,
            A[!add.values] <- NA
            A[add.values] <- format(A[add.values], digits = digits)
        }
        text(rep(x - 0.5, each = nrow(A)), rep(rev(y - 0.5), ncol(A)), A,
            adj = 0.5, cex = cex, col = col.values)
    }
}
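
## Hypothetical usage sketch of the new `cex` / `col.values` arguments added
## above (any small matrix works):
# matrixImage(matrix(rnorm(12), 3, 4), add.values = TRUE,
#     cex = 1.5, col.values = "red")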