fix: stability of gmlm_ising and added slicing,

add: higher-order approx.kron, add: paper.tex - simulation plots and tensor normal MLE estimation
2023-11-21 12:21:43 +01:00 · 2023-11-21 12:21:43 +01:00 · 399f878fbb
commit 399f878fbb
parent 6477d78190
8 changed files with 830 additions and 401 deletions
--- a/LaTeX/paper.tex
+++ b/LaTeX/paper.tex
@ -487,15 +487,15 @@ Equation \eqref{eq:eta2} is underdetermined since $\t{(\mat{T}_2\pinv{\mat{D}_p}
 \todo{Maybe already here introduce the ``constraint'' set of $\Omega$'s allowed as $\{ \Omega\in\mathbb{R}_{++}^{p\times p} : \vec{\Omega} = \t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})}\mat{T}_2\pinv{\mat{D}_p})}\vec{\Omega} \}$}

 In a classical \emph{generalized linear model} (GLM), the link function connecting the natural parameters to the expectation of the sufficient statistic $\mat{\eta}_y = \mat{g}(\E[\mat{t}(\ten{X}) \mid Y = y])$ is invertible. Such a link may not exist in our setting, but for our purpose what we call the ``inverse'' link suffices. The ``inverse'' link $\widetilde{\mat{g}}$ exists as the natural parameters fully describe the distribution. As in the non-minimal formulation \eqref{eq:quadratic-exp-fam}, we define the ``inverse'' link through its tensor valued components
-\begin{align*}
-    \ten{g}_1(\mat{\eta}_y) &= \E[\ten{X} \mid Y = y], \\
-    \ten{g}_2(\mat{\eta}_y) &= \E[\ten{X}\circ\ten{X} \mid Y = y]
-\end{align*}
+\begin{align}
+    \ten{g}_1(\mat{\eta}_y) &= \E[\ten{X} \mid Y = y], \label{eq:inv-link1}\\
+    \ten{g}_2(\mat{\eta}_y) &= \E[\ten{X}\circ\ten{X} \mid Y = y] \label{eq:inv-link2}
+\end{align}
 as $\widetilde{\mat{g}}(\mat{\eta}_y) = (\vec\ten{g}_1(\mat{\eta}_y), \vec\ten{g}_2(\mat{\eta}_y))$.

 Under the quadratic exponential family model \eqref{eq:quadratic-exp-fam},  a sufficient reduction for the regression of $Y$ on $\ten{X}$ is given in \cref{thm:sdr}.
 \begin{theorem}[SDR]\label{thm:sdr}
-    A sufficient reduction for the regression $Y\mid \ten{X}$ under the quadratic exponential family inverse regression model \eqref{eq:quadratic-exp-fam} with natural parameters \eqref{eq:eta1} and \eqref{eq:eta2} is given by
+    A sufficient reduction for the regression $Y\mid \ten{X}$ under the quadratic exponential family inverse regression model \eqref{eq:quadratic-exp-fam} with natural parameters modeled \eqref{eq:eta1} and \eqref{eq:eta2} is given by
    \begin{align}\label{eq:sdr}
        \ten{R}(\ten{X})
            % &= (\ten{X} - \E\ten{X})\times\{ \t{\mat{\beta}_1}, \ldots, \t{\mat{\beta}_r} \}.
@ -551,7 +551,7 @@ The maximum likelihood estimate of $\mat{\theta}_0$ is the solution to the optim
 \end{equation}
 with $\hat{\mat{\theta}}_n = (\vec\widehat{\overline{\ten{\eta}}}, \vec\widehat{\mat{B}}, \vech\widetilde{\mat{\Omega}})$ where $\widehat{\mat{B}} = \bigkron_{k = r}^{1}\widehat{\mat{\beta}}_k$ and $\widehat{\mat{\Omega}} = \bigkron_{k = r}^{1}\widehat{\mat{\Omega}}_k$.

-A straightforward and general method for parameter estimation is \emph{gradient descent}. For pure algorithmic speedup, by only changing the update rule but \emph{not} the gradient denoted $\nabla_{\mat{\theta}}l_n$ of the objective function, we use an adapted version of \emph{Nesterov Accelerated Gradient} descent (NAG) originaly described in \textcite{Nesterov1983} with an internal line search to determine the step size in each iteration, as described in \cref{alg:NAGD}. For applying gradient based optimization, we compute the gradients of $l_n$ in \cref{thm:grad}.
+A straightforward and general method for parameter estimation is \emph{gradient descent}. For applying gradient based optimization, we compute the gradients of $l_n$ in \cref{thm:grad}.

 \begin{theorem}\label{thm:grad}
    For $n$ i.i.d. observations $(\ten{X}_i, y_i), i = 1, ..., n$ the log-likelihood has the form \eqref{eq:log-likelihood} with $\mat{\theta}$ being the collection of all GMLM parameters $\overline{\ten{\eta}}$, ${\mat{B}} = \bigkron_{k = r}^{1}{\mat{\beta}}_k$ and ${\mat{\Omega}} = \bigkron_{k = r}^{1}{\mat{\Omega}}_k$ for $k = 1, ..., r$. Furthermore, let $\ten{G}_2(\mat{\eta}_y)$ be a tensor of dimensions $p_1, \ldots, p_r$ such that
@ -569,43 +569,84 @@ A straightforward and general method for parameter estimation is \emph{gradient
    If $\mat{T}_2$ is the identity matrix $\mat{I}_{p(p + 1) / 2}$, then $\ten{G}_2(\mat{\eta}_y) = \ten{g}_2(\mat{\eta}_y)$.
 \end{theorem}

-\begin{algorithm}[hpt!]
-    \caption{\label{alg:NAGD}Nesterov Accelerated Gradient Descent}
-    \begin{algorithmic}[1]
-        \State Objective: $l_n(\mat{\theta})$
-        \State Arguments: Order $r + 1$ tensors $\ten{X}$, $\ten{F}$ \todo{ask Effie about input description}
-        \State Initialize: Parameters $\mat{\theta}^{(0)}$, $0 < \delta^{(1)}$ and $0 < \gamma < 1$
-        \\
-        \State $t \leftarrow 1$
-            \Comment{step counter}
-        \State $\mat{\theta}^{(1)} \leftarrow \mat{\theta}^{(0)}$
-            \Comment{artificial first step}
-        \State $(m^{(0)}, m^{(1)}) \leftarrow (0, 1)$
-            \Comment{momentum extrapolation weights}
-        \\
-        \Repeat \Comment{repeat untill convergence}
-            \State $\mat{M} \leftarrow \mat{\theta}^{(t)} + \frac{m^{(t - 1)} - 1}{m^{(t)}}(\mat{\theta}^{(t)} - \mat{\theta}^{(t - 1)})$ \Comment{momentum extrapolation}
-            \For{$\delta = \gamma^{-1}\delta^{(t)}, \delta^{(t)}, \gamma\delta^{(t)}, \gamma^2\delta^{(t)}, \ldots$} \Comment{line search}
-                \State $\mat{\theta} \leftarrow \mat{M} + \delta \nabla l_n(\mat{M})$
-                \If{$l_n(\mat{\theta}) \leq l_n(\mat{\theta}^{(t - 1)}) - \delta \|\nabla_{\mat{\theta}} l_n(\mat{M})\|_F^2$} \Comment{Armijo condition}
-                    \State $\mat{\theta}^{(t + 1)} \leftarrow \mat{\theta}$
-                    \State $\delta^{(t + 1)} \leftarrow \delta$
-                    \Break
-                \EndIf
-            \EndFor
-            \State $m^{(t + 1)} \leftarrow \frac{1 + \sqrt{1 + (2 m^{(t)})^2}}{2}$ \Comment{update extrapolation weights}
-            \State $t \leftarrow t + 1$
-        \Until converged
-    \end{algorithmic}
-\end{algorithm}
+Although the general case of any GMLM model can be fitted via gradient descent using \cref{thm:grad}, this may be very inefficient. In case of the tensor normal distribution an iterative cyclic updating scheme can be derived by setting the partial gradients of \cref{thm:grad} to zero. Described in more details in \cref{sec:tensor_normal_estimation}, this method has much faster convergence, is stabel and does not requires any hyper parameters. On the other hand, the Ising model does not allow such a scheme. There we need to use a gradient based method, which is the subject of \cref{sec:ising_estimation}.

-\todo{Update the following, especially check that $\ten{X}$ needs to be center (as in the current version implicitly assumed)}
-\cref{alg:NAGD} requires initial parameter estimates $\mat{\theta}^{(0)}$. Therefore, we choose a simple heuristic approach by starting with $\mat{\Omega}_k^{(0)} = \mat{I}_{p_k}$ and $\overline{\ten{\eta}}_1^{(0)}$ dependent on the particular distribution. For example for a Tensor Normal distribution the sample mean is a good choice while for the Ising model zero initialization has nice properties. The interesting part are the multi-linear predictors $\mat{\beta}_k$. Therefore, let $\mat{\Sigma}_k = \frac{q_k}{n q}\sum_{i = 1}^n {\ten{F}_{y_i}}_{(k)}\t{{\ten{F}_{y_i}}_{(k)}}$ and $\mat{\Delta}_k = \frac{p_k}{n p}\sum_{i = 1}^n {\ten{X}_{i}}_{(k)}\t{{\ten{X}_{i}}_{(k)}}$ mode wise sample covariance estimates. For both we take the $s_k = \min(p_k, q_k)$ order SVD approximation as $\mat{\Delta}_k \approx \mat{U}_k\mat{D}_k\t{\mat{U}_k}$ and $\mat{\Sigma}_k \approx \mat{V}_k\mat{S}_k\t{\mat{V}_k}$. The approximate SVD gives us the $s_k$ first singular vectors with dimensions $\mat{U}_k(p_k\times s_k), \mat{D}_k(s_k\times s_k)$ and $\mat{V}_k(q_k\times s_k), \mat{S}_k(s_k\times s_k)$. Then we initialize
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Tensor Normal}\label{sec:tensor_normal_estimation}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+With $\ten{X}\mid Y = y$ following a tensor normal distribution, its density, with mean $\ten{\mu}_y$ and covariance $\mat{\Sigma} = \bigkron_{k = r}^{1}\mat{\Sigma}_k$ is given by
 \begin{displaymath}
-    \mat{\beta}_k^{(0)} = \mat{U}_k\sqrt{\mat{D}_k}\sqrt{\mat{S}_k}\t{\mat{V}_k}.
+    f_{\mat{\theta}}(\ten{X}\mid Y = y) = (2\pi)^{-p / 2}\prod_{k = 1}^{r}\det(\mat{\Sigma}_k)^{-p / 2 p_k}\exp\left( -\frac{1}{2}\left\langle\ten{X} - \ten{\mu}_y, (\ten{X} - \ten{\mu}_y)\mlm_{k = 1}^{r}\mat{\Sigma}_k^{-1} \right\rangle \right).
+\end{displaymath}
+We assume the distribution is non-degenerate which means that the covariances $\mat{\Sigma}_k$ are symmetric positive definite matrices. For the sake of simplicity, we w.l.o.g. assume $\ten{X}$ to have marginal expectation equal zero, $\E\ten{X} = 0$. Rewriting this in the quadratic exponential family form \eqref{eq:quadratic-exp-fam}, determines tha scaling constant $c = -1/2$, and give the relation to the GMLM parameters $\overline{\ten{\eta}}, \mat{\beta}_k$ and $\mat{\Omega}_k$, for $k = 1, \ldots, r$ as
+\begin{displaymath}
+    \ten{\mu}_y = \ten{F}_y\mlm_{k = 1}^{r}\mat{\Omega}_k^{-1}\mat{\beta}_k, \qquad
+    \mat{\Omega}_k = \mat{\Sigma}_k^{-1}
+\end{displaymath}
+where we used that $\overline{\ten{\eta}} = 0$ due to $0 = \E\ten{X} = \E\E[\ten{X}\mid Y] = \E\ten{\mu}_Y$ in combination with $\E\ten{F}_Y = 0$. Additionally, all the $\mat{\Omega}_k$'s are symmetric positive definite, because the $\mat{\Sigma}_k$'s are. This lead to another simplification the matrix $\mat{T}_2$ from \eqref{eq:t-stat} is equal to the identity. This also means that the gradients of the log-likelihood $l_n$ in \cref{thm:grad} are simpler as well. We get
+\begin{displaymath}
+    \ten{g}_1(\mat{\eta}_y) = \E[\ten{X}\mid Y = y] = \ten{\mu}_y, \qquad
+    \ten{G}_2(\mat{\eta}_y) = \ten{g}_2(\mat{\eta}_y) = \E[\ten{X}\circ\ten{X}\mid Y = y] \equiv \bigkron_{k = r}^1\mat{\Sigma}_k + (\vec{\ten{\mu}}_y)\t{(\vec{\ten{\mu}}_y)}.
 \end{displaymath}

-The default hyper-parameters in \cref{alg:NAGD} are set to $\delta^{(1)} = 1 / 100$ for the initial step size and the line search step parameter $\gamma = 2 / (1 + \sqrt{5})$ is the inverse golden ratio.
+In practice, we start the estimation process by demeaning a given data set $(\ten{X}_i, \ten{F}_{y_i})$ with $i = 1, \ldots, n$ observations. After demenaing, only the reduction matrices $\mat{\beta}_k$ and the scatter matrices $\mat{\Omega}_k$ need to be estimated. Then, to solve the optimization problem \eqref{eq:mle}, with $\overline{\ten{\eta}} = 0$ after demeaning, we need some initial values. For initial estimates $\hat{\mat{\beta}}_k^{(0)}$ we use a simple heuristic approach. First, we compute moment based mode-wise marginal covariance estimates $\widehat{\mat{\Sigma}}_k(\ten{X})$ and $\widehat{\mat{\Sigma}}_k(\ten{F}_Y)$ given by
+\begin{displaymath}
+    \widehat{\mat{\Sigma}}_k(\ten{X})   = \frac{1}{n}\sum_{i = 1}^{n} (\ten{X}_i)_{(k)}\t{(\ten{X}_i)_{(k)}}, \qquad
+    \widehat{\mat{\Sigma}}_k(\ten{F}_Y) = \frac{1}{n}\sum_{i = 1}^{n} (\ten{F}_{y_i})_{(k)}\t{(\ten{F}_{y_i})_{(k)}}.
+\end{displaymath}
+Then, for every mode $k = 1, \ldots, r$, we compute the first $j = 1, \ldots, q_k$ eigen vectors $\mat{v}_j(\widehat{\mat{\Sigma}}_k(\ten{X}))$, $\mat{v}_j(\widehat{\mat{\Sigma}}_k(\ten{F}_Y))$ and eigen values $\lambda_j(\widehat{\mat{\Sigma}}_k(\ten{X}))$, $\lambda_j(\widehat{\mat{\Sigma}}_k(\ten{X}))$ of the marginal covariance estimates. Let,
+\begin{align*}
+    \mat{U}_k &= (\mat{v}_1(\widehat{\mat{\Sigma}}_1(\ten{X})), \ldots, \mat{v}_{q_k}(\widehat{\mat{\Sigma}}_{q_k}(\ten{X}))), \\
+    \mat{D}_k &= \diag(\mat{v}_1(\widehat{\mat{\Sigma}}_1(\ten{X}))\mat{v}_1(\widehat{\mat{\Sigma}}_1(\ten{F}_{Y})), \ldots, \mat{v}_{q_k}(\widehat{\mat{\Sigma}}_{q_k}(\ten{X}))\mat{v}_{q_k}(\widehat{\mat{\Sigma}}_k(\ten{F}_{Y}))), \\
+    \mat{V}_k &= (\mat{v}_1(\widehat{\mat{\Sigma}}_1(\ten{F}_Y), \ldots, \mat{v}_{q_k}(\widehat{\mat{\Sigma}}_{q_k}(\ten{F}_Y)). \\
+\end{align*}
+The initial estimates are then set to
+\begin{displaymath}
+    \hat{\mat{\beta}}_k^{(0)} = \mat{U}_k\sqrt{\mat{D}_k}\t{\mat{V}_k}.
+\end{displaymath}
+The initial values of $\mat{\Omega}_k$ are set to the identity $\mat{\Omega}_k^{(0)} = \mat{I}_{p_k}$.
+
+Given any estimates $\hat{\mat{\beta}}_1, \ldots, \hat{\mat{\beta}}_r, \hat{\mat{\Omega}}_1, \ldots, \hat{\mat{\Omega}}_r$, we take the gradient $\nabla_{\mat{\beta}_j}$ of the log-likelihood $l_n$ in \eqref{eq:log-likelihood} of the tensor normal distributed from \cref{thm:grad} and keep all other parameters except $\mat{\beta}_j$ fixed. Then, $\nabla_{\mat{\beta}_j}l_n = 0$ has a closed form solution
+\begin{equation}\label{eq:tensor_normal_beta_solution}
+    \t{\mat{\beta}_j} = \biggl(
+        \sum_{i = 1}^{n}
+        \Bigl( \ten{F}_{y_i}\mlm_{k \neq j}\hat{\mat{\Omega}}_k^{-1}\hat{\mat{\beta}}_k \Bigr)_{(j)}
+        \t{\Bigl( \ten{F}_{y_i}\mlm_{k \neq j}\hat{\mat{\beta}}_k \Bigr)_{(j)}}
+    \biggr)^{-1}
+    \biggl(
+        \sum_{i = 1}^{n}
+        \Bigl( \ten{F}_{y_i}\mlm_{k \neq j}\hat{\mat{\beta}}_k \Bigr)_{(j)}
+        \t{(\ten{X}_{i})_{(j)}}
+    \biggr)
+        \hat{\mat{\Omega}}_j.
+\end{equation}
+For the scatter matrices $\mat{\Omega}_j$, we need to fudge a bit. Equating the partial gradient of the $j$'th scatter matrix $\mat{\Omega}_j$ in \cref{thm:grad} to zero, $\nabla_{\mat{\Omega}_j}l_n = 0$, gives a quadratic matrix equation. This is due to the dependence of $\ten{\mu}_y$ on $\mat{\Omega}_j$. In practive thoug, it is more stable, faster and equaly accurate to use mode-wise covariance estimates via the residuals
+\begin{displaymath}
+    \hat{\ten{R}}_i = \ten{X}_i - \hat{\ten{\mu}}_{y_i} = \ten{X}_i - \ten{F}_{y_i}\mlm_{k = 1}^{r}\hat{\mat{\Omega}}_k^{-1}\hat{\mat{\beta}}_k.
+\end{displaymath} 
+The estimates are computed via the inbetween values
+\begin{displaymath}
+    \tilde{\mat{\Sigma}}_j = \sum_{i = 1}^n (\hat{\ten{R}}_i)_{(j)} \t{(\hat{\ten{R}}_i)_{(j)}}
+\end{displaymath}
+which only lacks scaling $\tilde{s}\tilde{\mat{\Sigma}}_j = \hat{\mat{\Omega}}_j^{-1}$. The scaling condition is that the mean squared error has to be equal to the trace of the covariance estimate,
+\begin{displaymath}
+    \frac{1}{n}\sum_{i = 1}^n \langle \hat{\ten{R}}_i, \hat{\ten{R}}_i \rangle = \tr\bigkron_{k = r}^{1}\hat{\mat{\Omega}}_k^{-1} = \prod_{k = 1}^{r}\tr{\hat{\mat{\Omega}}_k^{-1}} = \tilde{s}^r\prod_{k = 1}^{r}\tr{\tilde{\mat{\Sigma}}_k}.
+\end{displaymath}
+Solving for the scaling value gives
+\begin{displaymath}
+    \tilde{s} = \biggl(\Bigl(\prod_{k = 1}^{r}\tr{\tilde{\mat{\Sigma}}_k}\Bigr)^{-1}\frac{1}{n}\sum_{i = 1}^n \langle \hat{\ten{R}}_i, \hat{\ten{R}}_i \rangle\biggr)^{1 / r}
+\end{displaymath}
+resulting in the estimates $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j)^{-1}$.
+
+Estimation is then performed by updating for $j = 1, \ldots, r$ the estimates $\hat{\mat{\beta}}_j$ via \eqref{eq:tensor_normal_beta_solution}, then we recompute the $\hat{\mat{\Omega}}_j$ estimates simultaniously keeping the $\hat{\mat{\beta}}_j$'s fixed. This procedure is repeated untill convergence. Convergence is very fast, experiments showed that convergence occures usualy in less than $10$ iterations.
+
+A technical detail for numerical stability is to ensure that the scaled inbetween values $\tilde{s}\tilde{\mat{\Sigma}}_j$, assumed to be symmetric and positive definite, are well conditioned. Otherwise, the procedure can be numerically unstable by attempting to compute $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j)^{-1}$. Therefore, we estimate the condition number of $\tilde{s}\tilde{\mat{\Sigma}}_j$ before computing the inverse. In case of being ill conditioned, we regularize and use $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j + 0.2 \lambda_{1}(\tilde{s}\tilde{\mat{\Sigma}}_j)\mat{I}_{p_j})^{-1}$ instead, where $\lambda_{1}(\tilde{s}\tilde{\mat{\Sigma}}_j)$ is the first (maximum) eigen value. Experiments showed that this regularization is usually required in the first few iterations only.
+
+Furthermore, we may assume that the parameter space follows a more generel setting as in \cref{thm:param-manifold}. In this case, updating may produces estimates outside of the parameter space. A simple and efficient method is to project every updated estimate onto the corresponding manifold.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Ising Model}\label{sec:ising_estimation}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


 \newpage
@ -674,6 +715,8 @@ As a basis to ensure that the constraint parameter space $\Theta$ is a manifold,
    then the constrained parameter space $\Theta = \mathbb{R}^p \times \manifold{K}_{\mat{B}}\times\manifold{CK}_{\mat{\Omega}}\subset\mathbb{R}^{p(p + 2 q + 3) / 2}$ is a smooth embedded manifold.
 \end{theorem}

+\subsection{Matrix Manifolds}\label{sec:matrix-manifolds}
+
 {\color{red}
    Flexibility of Manifold Structure

@ -716,55 +759,6 @@ It is not necessary to have a perfect maximizer, as long as the objective has fi
    with asymptotic variance-covariance structure $\mat{\Sigma}_{\mat{\theta}_0}$ given in \eqref{eq:asymptotic-covariance-gmlm}.
 \end{theorem}

-% \begin{theorem}[{\textcite[Thm~5.35]{asymStats-van_der_Vaart1998}}]
-%     Let $\{ p_{\mat{\theta}} : \mat{\theta}\in\Theta \}$ be a collection of identifiable subprobability\footnote{A subprobability measure is a measure $\nu$ for the measure space $(\mathfrak{X}, \nu)$ such that $\nu(\mathfrak{X}) \leq 1$.} densities such that $P_{\mat{\theta}_0}$ is a probability measure. Then $M(\mat{\theta}) = P_{\mat{\theta}_0}\log p_{\mat{\theta}} / p_{\mat{\theta}_0}$ attains its maximum uniquely at $\mat{\theta}_0$.
-% \end{theorem}
-
-% \begin{proof}[Proof of \cref{thm:asymptotic-normality-gmlm}]
-%     First we prove the consistency for which we apply \cref{thm:consistency-on-metric-spaces} and then prove that there
-
-%     Most parts of the proof are identical to the proof \textcite[Thm~3.1]{asymptoticMLE-BuraEtAl2018} with addaptations in a few places.
-
-%     {\color{green}Bla Bla Bla ...}
-
-%     \todo{regularity conditions!}
-
-%     By rewriting \eqref{eq:eta1} as
-%     \begin{displaymath}
-%         \mat{\eta}_{1y} =
-%         \vec{\overline{\ten{\eta}}} + \mat{B}\vec{\ten{F}_y} = (\vec{\overline{\ten{\eta}}}, \mat{B})\begin{pmatrix}
-%             1 \\ \vec{\ten{F}_y}
-%         \end{pmatrix} = (\mat{I}_p\otimes \t{(1, \vec{\ten{F}_y})})\vec((\vec{\overline{\ten{\eta}}}, \mat{B}))
-%     \end{displaymath}
-%     the natural parameter $\mat{\eta}_y$ with components \eqref{eq:eta1}, \eqref{eq:eta2} can be written with $\mat{\theta} = (\vec{\overline{\ten{\eta}}}, \vec{\mat{B}}, \vec{\mat{\Omega}})$ as
-%     \begin{displaymath}
-%         \mat{\eta}_y = \begin{pmatrix}
-%             \mat{I}_p\otimes \t{(1, \vec{\ten{F}_y})} & 0 \\
-%             0 & c\t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})})}
-%         \end{pmatrix}\mat{\theta} = \mat{F}(y)\mat{\theta}
-%     \end{displaymath}
-%     Using the new form, let with $z = (\ten{X}, y)$
-%     \begin{displaymath}
-%         m_{\mat{\theta}}(z) = \langle \mat{F}(y)\mat{\theta}, \mat{t}(\ten{X}) \rangle - b(\mat{F}(y)\mat{\theta})
-%     \end{displaymath}
-%     which allows to write the log-likelihood \eqref{eq:log-likelihood}, scaled by $n^{-1}$, with $Z_i = (\ten{X}_i, Y_i)$, as
-%     \begin{displaymath}
-%         M_n(\mat{\theta}) = \frac{1}{n}\sum_{i = 1}^{n} m_{\mat{\theta}}(Z_i) = \frac{1}{n} l_n(\mat{\theta}).
-%     \end{displaymath}
-
-%     The strict convexity of $b$ ensures that $\mat{\theta}\mapsto m_{\mat{\theta}}(z)$ is strictly concave for every fixed $z$. Taking expectation preserves the concavity yielding $M(\mat{\theta}) = \E m_{\mat{\theta}}(Z)$ to be concave aswell. The identifiability \todo{???} of \todo{???} allows to apply \textcite[Thm~5.35]{asymStats-van_der_Vaart1998} which gives for the family of conditional densities $\{ f_{\mat{\theta}}(\,.\mid Y) : \mat{\theta}\in\Theta \}$ that
-%     \begin{displaymath}
-%         \E[m_{\mat{\theta}}(Z)\mid Y] \leq \E[m_{\mat{\theta}_0}(Z)\mid Y].
-%     \end{displaymath}
-%     Taking the expectation with respect to $Y$ gives $M(\mat{\theta}) = \E[m_{\mat{\theta}}(Z)] \leq \E[m_{\mat{\theta}_0}(Z)] = M(\mat{\theta}_0)$ for every $\mat{\theta}$. Now, assume their exists a $\mat{\theta}_1$ such that $M(\mat{\theta}_1) = M(\mat{\theta}_0)$, then $P(\mat{F}(Y)\mat{\theta}_1 = \mat{F}(Y)\mat{\theta}_0) = 1$ which means \todo{figure that out why} which contradicts \todo{$P(\mat{F}(Y)\mat{\theta} = \overline{\ten{\eta}}) = 1$ which is a violaten of an assumption.} Finally, with the integrability assumtion we can applying \textcite[Thm~A.1]{asymptoticMLE-BuraEtAl2018} which gives that the MLE $\hat{\mat{\theta}}_n$ of \eqref{eq:mle} exists and is unique.\todo{Fix that!!!}
-
-    
-%     {\color{green}Foo Bar Baz ...}
-
-%     \todo{finish me!}
-% \end{proof}
-
-

 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \subsection{Asymptotic Normality}
@ -814,38 +808,96 @@ The following is a reformulation of \textcite[Lemma~2.3]{asymptoticMLE-BuraEtAl2
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 We compair our method with a few other methods for the tensor normal and the Ising model (inverse Ising problem).

-\subsection{Tensor Normal Distribution}
-The \emph{tensor normal distribution} $\mathcal{TN}(\ten{\mu}, \mat{\Sigma}_1, ..., \mat{\Sigma}_r)$ is a generalization of the \emph{matrix normal distribution} $\mathcal{MN}(\mat{\mu}, \mat{\Sigma}_1, \mat{\Sigma}_2)$. The density of the conditional tensor normal distribution $\ten{X}\mid Y = y$ is given by
-\begin{displaymath}
-    f_{\mat{\theta}_y}(\ten{X}\mid Y = y) = (2\pi)^{-p/2}\prod_{k = 1}^r |\mat{\Sigma}_{k}|^{-p / 2 p_{k}}\exp\Big(
-        -\frac{1}{2}\Big\langle \ten{X} - \ten{\mu}_y, (\ten{X} - \ten{\mu}_y)\mlm_{k\in[r]}\mat{\Sigma}_{k} \Big\rangle
-    \Big)
-\end{displaymath}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Tensor Normal}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

-\subsection{Ising Model}
+
+\begin{itemize}
+    \item[a] A simple setup with linear relation for 
+    \item[b] 
+    \item[c] 
+    \item[d] 
+    \item[e] Missspecified model .......
+\end{itemize}

 \begin{figure}
    \centering
-    \begin{scaletikzpicturetowidth}{0.5\textwidth}
-        \input{plots/sim-normal.tex}
-    \end{scaletikzpicturetowidth}
-    \caption{\label{fig:sim-normal}As a test case we show the $3$-tensor normal (generalization of the matrix normal)}
+    \includegraphics[width = \textwidth]{plots/sim-normal.pdf}
+    \caption{\label{fig:sim-normal}asknclknasknc}
 \end{figure}


-\subsection{Matrix Normal and Ising Model}

-If $\mat{X} \in \mathbb{R}^{p_1 \times p_2}$, then $\mat{\theta} = (\overline{\eta}, \mat{\beta}_1, \mat{\beta}_2, \mat{\Omega}_1, \mat{\Omega}_2)$ and $\mat{F}_y$ is also matrix valued. The conditional pdf of $\mat{X}\mid Y$ is
-\begin{align*}
-    f_{\mat{\theta}}(\mat{X}\mid Y = y)
-        &= h(\mat{X})\exp(\langle\mat{X}, \mat{\eta}_1(\mat{\theta})\rangle + \langle\mat{X}\circ\mat{X}, \mat{\eta}_2(\mat{\theta})\rangle - b(\mat{\eta}_y(\mat{\theta}))) \\
-        &= h(\mat{X})\exp(\tr((\overline{\mat{\eta}} + \mat{\beta}_1\mat{F}_y\t{\mat{\beta}_2})\t{\mat{X}}) + \tr(\mat{\Omega}_1\mat{X}\mat{\Omega}_2\t{\mat{X}}) - b(\mat{\eta}_y(\mat{\theta}))).
-\end{align*}
-The MLE estimate $\hat{\mat{\theta}}_n = (\widehat{\overline{\mat{\eta}}}, \widehat{\mat{\beta}}_2\otimes \widehat{\mat{\beta}}_1, \widehat{\mat{\Omega}}_2\otimes \widehat{\mat{\Omega}}_2)$ is asymptotically normal and
-\begin{displaymath}
-    \widehat{\ten{R}}(\mat{X}) = \t{(\widehat{\mat{\beta}}_2\otimes \widehat{\mat{\beta}}_1)}\vec(\mat{X} - \E\mat{X}) \equiv \t{\widehat{\mat{\beta}}_1}(\mat{X} - \E\mat{X})\widehat{\mat{\beta}}_2
-\end{displaymath}
-is the MLE of the sufficient reduction $\ten{R}(\mat{X})$ of dimension $q_1 \times q_2 \leq p_1 \times p_2 $.\\[1.6em]
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Ising Model}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\begin{figure}
+    \centering
+    \includegraphics[]{plots/sim-ising.pdf}
+    \caption{\label{fig:sim-ising}asknclknasknc}
+\end{figure}
+
+\begin{table}
+    \begin{tabular}{c | ccc ccc c}
+        $n$ &             GMLM  &         PCA &       HOPCA &       LPCA  &       CLPCA &        TSIR &       MGCCA \\
+        \hline
+        100 & {\bf 0.34} (0.14) & 0.90 (0.04) & 0.90 (0.05) & 0.94 (0.09) & 0.91 (0.03) & 0.48 (0.19) & 0.55 (0.13) \\
+        200 & {\bf 0.25} (0.11) & 0.90 (0.03) & 0.90 (0.03) & 0.96 (0.07) & 0.91 (0.02) & 0.38 (0.16) & 0.53 (0.10) \\
+        300 & {\bf 0.20} (0.09) & 0.89 (0.02) & 0.89 (0.02) & 0.97 (0.06) & 0.91 (0.02) & 0.29 (0.13) & 0.51 (0.11) \\
+        500 & {\bf 0.16} (0.07) & 0.90 (0.02) & 0.90 (0.02) & 0.98 (0.01) & 0.91 (0.01) & 0.23 (0.10) & 0.50 (0.08) \\
+        750 & {\bf 0.13} (0.05) & 0.90 (0.01) & 0.90 (0.01) & 0.98 (0.02) & 0.91 (0.01) & 0.23 (0.08) & 0.53 (0.06)
+    \end{tabular}
+    \caption{\label{tab:sim-ising}xyz uvw}
+\end{table}
+
+% The \emph{tensor normal distribution} $\mathcal{TN}(\ten{\mu}, \mat{\Sigma}_1, ..., \mat{\Sigma}_r)$ is a generalization of the \emph{matrix normal distribution} $\mathcal{MN}(\mat{\mu}, \mat{\Sigma}_1, \mat{\Sigma}_2)$.  \todo{ref} The density of the conditional tensor normal distribution $\ten{X}\mid Y = y$ according to the quadratic exponential family \eqref{eq:quadratic-exp-fam} where only the first moment depends on $y$ is given by
+% \begin{displaymath}
+%     f_{\mat{\theta}}(\ten{X}\mid Y = y) = (2\pi)^{-p/2}\prod_{k = 1}^r |\mat{\Sigma}_{k}|^{-p / 2 p_{k}}\exp\Big(
+%         -\frac{1}{2}\Big\langle \ten{X} - \ten{\mu}_y, (\ten{X} - \ten{\mu}_y)\mlm_{k\in[r]}\mat{\Sigma}_{k} \Big\rangle
+%     \Big)
+% \end{displaymath}
+% Rewriting this in the form of an exponential family as in \eqref{eq:quadratic-exp-fam} allows to determin the natural parameter components $\mat{\eta}_{y1}$ and $\mat{\eta}_2$. Since a non-degenerate normal distribution requires the covariance matrices $\mat{\Sigma}_k$ to be symmetric positive definite the relation to the second moment natural parameter $\mat{\eta}_2$ simplifies as we can set $\mat{T}_2$ in \eqref{eq:eta2-manifold} to the identity. This then gives the relation to the natural parameters as in \eqref{eq:eta1-manifold} and \eqref{eq:eta2-manifold} as
+% \begin{displaymath}
+%     \mat{\eta}_{1y} = \vec\Bigl(\ten{\mu}_y\mlm_{k = 1}^{r}\mat{\Sigma}_k\Bigr), \qquad
+%     \mat{\eta}_2    = c\t{\mat{D}_p}\vec\bigkron_{k = r}^{1}\mat{\Sigma}_k^{-1}
+% \end{displaymath}
+% with scaling constant $c = -1 / 2$. Modeling the natural parameters as in \eqref{eq:eta1} and \eqref{eq:eta2} relates the mean $\ten{\mu}_y$ and the covariance matrices $\mat{\Sigma}_k$ of the tensor normal to the generalized multi-linear model parameter $\overline{\ten{\eta}}$ and $\mat{\beta}_k$, $\mat{\Omega}_k$, for $k = 1, \ldots, r$ through
+% \begin{displaymath}
+%     \ten{\mu}_y = \Bigl(\overline{\ten{\eta}} + \ten{F}_y\mlm_{j = 1}^{r}\mat{\beta}_j\Bigr)\mlm_{k = 1}^{r}\mat{\Omega}_k^{-1}, \qquad \mat{\Omega}_k = \mat{\Sigma}_k^{-1}.
+% \end{displaymath}
+% This completely determines the tensor normal distribution given the GMLM parameter.
+
+
+
+
+% To estimate the GMLM parameters $\mat{\theta} = (\overline{\ten{\eta}}, \mat{\beta}_1, \ldots, \mat{\beta}_r, \mat{\Omega}_1, \ldots\mat{\Omega}_r)$ given a data set $(\ten{X}_i, y_i)$ of $i = 1, \ldots, n$ observation we use the gradients provided by \cref{thm:grad}. It turns out that the equations $\nabla_{\overline{\ten{\eta}}}l_n = 0, \nabla_{\mat{\beta}_j}l_n = 0$ and $\nabla_{\mat{\Omega}_j}l_n = 0$, for $j = 1, \ldots, r$, can be solved for the differentiation variable assuming all the other parameter blocks to be constant. Centering the observed $\ten{X}$ leads to a cleaner formulation given by  \todo{fix the following!}
+% \begin{align*}
+%     \hat{\overline{\ten{\eta}}} &= \frac{1}{n}\sum_{i = 1}^{n} \ten{X}_i, \\
+%     \t{\hat{\mat{\beta}}_j} &=  \mat{\Omega}_j \Bigl(\Bigl(\ten{F}_{y_i}\mlm_{k \neq j}\mat{\Omega}_k^{-1}\mat{\beta}_k\Bigr)_{(j)}\t{\Bigl(\ten{F}_{y_i}\mlm_{k \neq j}\mat{\beta}_k\Bigr)_{(j)}}\Bigr)^{-1}\Bigl(\ten{F}_{y_i}\mlm_{k \neq j}\mat{\beta}_k\Bigr)_{(j)}\ten{X}_{(j)}, \\
+%     \t{\Omega}_j &= scaling \frac{1}{n}
+% \end{align*}
+
+% This allows to use a \emph{block coordinate descent} method instead of gradient descent. This method keeps all but one parameter block fixed and optimized the objective for a single block. Given a closed form solution for the partial gradients, only a single update is required to solve the partial optimization problem. This means that the block coordinate descent method reduces to a cyclic updating. This not only converges very fast, it also does not require any hyper parameters.
+
+% For this iterative scheme we do need some initial estimates. For those we 
+
+
+
+% \subsection{Matrix Normal and Ising Model}
+
+% If $\mat{X} \in \mathbb{R}^{p_1 \times p_2}$, then $\mat{\theta} = (\overline{\eta}, \mat{\beta}_1, \mat{\beta}_2, \mat{\Omega}_1, \mat{\Omega}_2)$ and $\mat{F}_y$ is also matrix valued. The conditional pdf of $\mat{X}\mid Y$ is
+% \begin{align*}
+%     f_{\mat{\theta}}(\mat{X}\mid Y = y)
+%         &= h(\mat{X})\exp(\langle\mat{X}, \mat{\eta}_1(\mat{\theta})\rangle + \langle\mat{X}\circ\mat{X}, \mat{\eta}_2(\mat{\theta})\rangle - b(\mat{\eta}_y(\mat{\theta}))) \\
+%         &= h(\mat{X})\exp(\tr((\overline{\mat{\eta}} + \mat{\beta}_1\mat{F}_y\t{\mat{\beta}_2})\t{\mat{X}}) + \tr(\mat{\Omega}_1\mat{X}\mat{\Omega}_2\t{\mat{X}}) - b(\mat{\eta}_y(\mat{\theta}))).
+% \end{align*}
+% The MLE estimate $\hat{\mat{\theta}}_n = (\widehat{\overline{\mat{\eta}}}, \widehat{\mat{\beta}}_2\otimes \widehat{\mat{\beta}}_1, \widehat{\mat{\Omega}}_2\otimes \widehat{\mat{\Omega}}_2)$ is asymptotically normal and
+% \begin{displaymath}
+%     \widehat{\ten{R}}(\mat{X}) = \t{(\widehat{\mat{\beta}}_2\otimes \widehat{\mat{\beta}}_1)}\vec(\mat{X} - \E\mat{X}) \equiv \t{\widehat{\mat{\beta}}_1}(\mat{X} - \E\mat{X})\widehat{\mat{\beta}}_2
+% \end{displaymath}
+% is the MLE of the sufficient reduction $\ten{R}(\mat{X})$ of dimension $q_1 \times q_2 \leq p_1 \times p_2 $.\\[1.6em]

 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \appendix
--- a/LaTeX/plots/sim-ising.tex
+++ b/LaTeX/plots/sim-ising.tex
@ -0,0 +1,151 @@
+%%% R code to generate the input data files from corresponding simulation logs
+% R> setwd("~/Work/tensorPredictors")
+% R>
+% R> for (sim.name in c("2a")) {
+% R>     pattern <- paste0("sim\\_", sim.name, "\\_ising\\-[0-9T]*\\.csv")
+% R>     log.file <- sort(
+% R>         list.files(path = "sim/", pattern = pattern, full.names = TRUE),
+% R>         decreasing = TRUE
+% R>     )[[1]]
+% R>
+% R>     sim <- read.csv(log.file)
+% R>
+% R>     aggr <- aggregate(sim[, names(sim) != "sample.size"], list(sample.size = sim$sample.size), mean)
+% R>
+% R>     write.table(aggr, file = paste0("LaTeX/plots/aggr-", sim.name, "-ising.csv"), row.names = FALSE, quote = FALSE)
+% R> }
+\documentclass[border=0cm]{standalone}
+
+\usepackage{tikz}
+\usepackage{pgfplots}
+\usepackage{bm}
+
+\definecolor{gmlm}{RGB}{0,0,0}
+\definecolor{mgcca}{RGB}{86,180,233}
+\definecolor{tsir}{RGB}{0,158,115}
+\definecolor{pca}{RGB}{240,228,66}
+\definecolor{hopca}{RGB}{230,159,0}
+\definecolor{lpca}{RGB}{127,127,127}
+\definecolor{clpca}{RGB}{191,191,191}
+
+\pgfplotsset{
+    every axis/.style={
+        xtick={100,200,300,500,750},
+        ymin=-0.05, ymax=1.05,
+        grid=both,
+        grid style={gray, dotted}
+    },
+    every axis plot/.append style={
+        mark = *,
+        mark size = 1pt,
+        line width=0.8pt
+    }
+}
+\tikzset{
+    legend entry/.style={
+        mark = *,
+        mark size = 1pt,
+        mark indices = {2},
+        line width=0.8pt
+    }
+}
+
+\begin{document}
+\begin{tikzpicture}[>=latex]
+
+    \begin{axis}[
+        name=sim-2a
+    ]
+        \addplot[color = gmlm, line width=1pt]  table[x = sample.size, y = dist.subspace.gmlm]  {aggr-1a-normal.csv};
+        \addplot[color = pca]   table[x = sample.size, y = dist.subspace.pca]   {aggr-2a-ising.csv};
+        \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-2a-ising.csv};
+        \addplot[color = tsir]  table[x = sample.size, y = dist.subspace.tsir]  {aggr-2a-ising.csv};
+        \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-2a-ising.csv};
+        \addplot[color = lpca]  table[x = sample.size, y = dist.subspace.lpca]  {aggr-2a-ising.csv};
+        \addplot[color = clpca] table[x = sample.size, y = dist.subspace.clpca] {aggr-2a-ising.csv};
+    \end{axis}
+    \node[anchor = base west, yshift = 0.3em] at (sim-2a.north west) {
+        a: small
+    };
+
+    % \begin{axis}[
+    %     name=sim-1b,
+    %     anchor=north west, at={(sim-2a.right of north east)}, xshift=0.1cm,
+    %     xticklabel=\empty,
+    %     ylabel near ticks, yticklabel pos=right
+    % ]
+    %     \addplot[color = pca]   table[x = sample.size, y = dist.subspace.pca]   {aggr-1b-normal.csv};
+    %     \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1b-normal.csv};
+    %     \addplot[color = tsir]  table[x = sample.size, y = dist.subspace.tsir]  {aggr-1b-normal.csv};
+    %     \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1b-normal.csv};
+    %     \addplot[color = gmlm, line width=1pt]  table[x = sample.size, y = dist.subspace.gmlm]  {aggr-1b-normal.csv};
+    % \end{axis}
+    % \node[anchor = base west, yshift = 0.3em] at (sim-1b.north west) {
+    %     b: cubic dependence on $y$
+    % };
+
+    % \begin{axis}[
+    %     name=sim-1c,
+    %     anchor=north west, at={(sim-2a.below south west)}, yshift=-.8em,
+    %     xticklabel=\empty
+    % ]
+    %     \addplot[color = pca]   table[x = sample.size, y = dist.subspace.pca]   {aggr-1c-normal.csv};
+    %     \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1c-normal.csv};
+    %     \addplot[color = tsir]  table[x = sample.size, y = dist.subspace.tsir]  {aggr-1c-normal.csv};
+    %     \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1c-normal.csv};
+    %     \addplot[color = gmlm, line width=1pt]  table[x = sample.size, y = dist.subspace.gmlm]  {aggr-1c-normal.csv};
+    % \end{axis}
+    % \node[anchor = base west, yshift = 0.3em] at (sim-1c.north west) {
+    %     c: rank $1$ $\boldsymbol{\beta}$'s
+    % };
+
+    % \begin{axis}[
+    %     name=sim-1d,
+    %     anchor=north west, at={(sim-1c.right of north east)}, xshift=0.1cm,
+    %     ylabel near ticks, yticklabel pos=right
+    % ]
+    %     \addplot[color = pca]   table[x = sample.size, y = dist.subspace.pca]   {aggr-1d-normal.csv};
+    %     \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1d-normal.csv};
+    %     \addplot[color = tsir]  table[x = sample.size, y = dist.subspace.tsir]  {aggr-1d-normal.csv};
+    %     \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1d-normal.csv};
+    %     \addplot[color = gmlm, line width=1pt]  table[x = sample.size, y = dist.subspace.gmlm]  {aggr-1d-normal.csv};
+    % \end{axis}
+    % \node[anchor = base west, yshift = 0.3em] at (sim-1d.north west) {
+    %     d: tri-diagonal $\boldsymbol{\Omega}$'s
+    % };
+
+    % \begin{axis}[
+    %     name=sim-1e,
+    %     anchor=north west, at={(sim-1c.below south west)}, yshift=-.8em
+    % ]
+    %     \addplot[color = pca]   table[x = sample.size, y = dist.subspace.pca]   {aggr-1e-normal.csv};
+    %     \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1e-normal.csv};
+    %     \addplot[color = tsir]  table[x = sample.size, y = dist.subspace.tsir]  {aggr-1e-normal.csv};
+    %     \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1e-normal.csv};
+    %     \addplot[color = gmlm, line width=1pt]  table[x = sample.size, y = dist.subspace.gmlm]  {aggr-1e-normal.csv};
+    % \end{axis}
+    % \node[anchor = base west, yshift = 0.3em] at (sim-1e.north west) {
+    %     e: missspecified
+    % };
+
+
+    \matrix[anchor = west] at (sim-2a.right of east) {
+        \draw[color=gmlm,  legend entry, line width=1pt] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {GMLM};  \\
+        \draw[color=tsir,  legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {TSIR};  \\
+        \draw[color=mgcca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {MGCCA}; \\
+        \draw[color=hopca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {HOPCA}; \\
+        \draw[color=pca,   legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {PCA};   \\
+        \draw[color=lpca,  legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {LPCA};   \\
+        \draw[color=clpca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {CLPCA};   \\
+    };
+
+
+    \node[anchor = north] at (current bounding box.south) {Sample Size $n$};
+
+    \node[anchor = south, rotate = 90] at (current bounding box.west) {Subspace Distance $d(\boldsymbol{B}, \hat{\boldsymbol{B}})$};
+    \node[anchor = south, rotate = 270] at (current bounding box.east) {\phantom{Subspace Distance $d(\boldsymbol{B}, \hat{\boldsymbol{B}})$}};
+
+    \node[anchor = south, font=\large] at (current bounding box.north) {Tensor Normal Simulation};
+
+\end{tikzpicture}
+\end{document}
--- a/LaTeX/plots/sim-normal.tex
+++ b/LaTeX/plots/sim-normal.tex
@ -0,0 +1,148 @@
+%%% R code to generate the input data files from corresponding simulation logs
+% R> setwd("~/Work/tensorPredictors")
+% R>
+% R> for (sim.name in c("1a", "1b", "1c", "1d", "1e")) {
+% R>     pattern <- paste0("sim\\_", sim.name, "\\_normal\\-[0-9T]*\\.csv")
+% R>     log.file <- sort(
+% R>         list.files(path = "sim/", pattern = pattern, full.names = TRUE),
+% R>         decreasing = TRUE
+% R>     )[[1]]
+% R>
+% R>     sim <- read.csv(log.file)
+% R>
+% R>     aggr <- aggregate(sim[, names(sim) != "sample.size"], list(sample.size = sim$sample.size), mean)
+% R>
+% R>     write.table(aggr, file = paste0("LaTeX/plots/aggr-", sim.name, "-normal.csv"), row.names = FALSE, quote = FALSE)
+% R> }
+\documentclass[border=0cm]{standalone}
+
+\usepackage{tikz}
+\usepackage{pgfplots}
+\usepackage{bm}
+
+\definecolor{gmlm}{RGB}{0,0,0}
+\definecolor{mgcca}{RGB}{86,180,233}
+\definecolor{tsir}{RGB}{0,158,115}
+\definecolor{hopca}{RGB}{230,159,0}
+\definecolor{pca}{RGB}{240,228,66}
+\definecolor{lpca}{RGB}{0,114,178}
+\definecolor{clpca}{RGB}{213,94,0}
+
+\pgfplotsset{
+    every axis/.style={
+        xtick={100,200,300,500,750},
+        ymin=-0.05, ymax=1.05,
+        grid=both,
+        grid style={gray, dotted}
+    },
+    every axis plot/.append style={
+        mark = *,
+        mark size = 1pt,
+        line width=0.8pt
+    }
+}
+\tikzset{
+    legend entry/.style={
+        mark = *,
+        mark size = 1pt,
+        mark indices = {2},
+        line width=0.8pt
+    }
+}
+
+\begin{document}
+\begin{tikzpicture}[>=latex]
+
+    \begin{axis}[
+        name=sim-1a,
+        xticklabel=\empty
+    ]
+        \addplot[color = gmlm, line width=1pt]  table[x = sample.size, y = dist.subspace.gmlm]  {aggr-1a-normal.csv};
+        \addplot[color = pca]   table[x = sample.size, y = dist.subspace.pca]   {aggr-1a-normal.csv};
+        \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1a-normal.csv};
+        \addplot[color = tsir]  table[x = sample.size, y = dist.subspace.tsir]  {aggr-1a-normal.csv};
+        \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1a-normal.csv};
+    \end{axis}
+    \node[anchor = base west, yshift = 0.3em] at (sim-1a.north west) {
+        a: linear dependence on $\mathcal{F}_y \equiv y$
+    };
+
+    \begin{axis}[
+        name=sim-1b,
+        anchor=north west, at={(sim-1a.right of north east)}, xshift=0.1cm,
+        xticklabel=\empty,
+        ylabel near ticks, yticklabel pos=right
+    ]
+        \addplot[color = pca]   table[x = sample.size, y = dist.subspace.pca]   {aggr-1b-normal.csv};
+        \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1b-normal.csv};
+        \addplot[color = tsir]  table[x = sample.size, y = dist.subspace.tsir]  {aggr-1b-normal.csv};
+        \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1b-normal.csv};
+        \addplot[color = gmlm, line width=1pt]  table[x = sample.size, y = dist.subspace.gmlm]  {aggr-1b-normal.csv};
+    \end{axis}
+    \node[anchor = base west, yshift = 0.3em] at (sim-1b.north west) {
+        b: cubic dependence on $y$
+    };
+
+    \begin{axis}[
+        name=sim-1c,
+        anchor=north west, at={(sim-1a.below south west)}, yshift=-.8em,
+        xticklabel=\empty
+    ]
+        \addplot[color = pca]   table[x = sample.size, y = dist.subspace.pca]   {aggr-1c-normal.csv};
+        \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1c-normal.csv};
+        \addplot[color = tsir]  table[x = sample.size, y = dist.subspace.tsir]  {aggr-1c-normal.csv};
+        \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1c-normal.csv};
+        \addplot[color = gmlm, line width=1pt]  table[x = sample.size, y = dist.subspace.gmlm]  {aggr-1c-normal.csv};
+    \end{axis}
+    \node[anchor = base west, yshift = 0.3em] at (sim-1c.north west) {
+        c: rank $1$ $\boldsymbol{\beta}$'s
+    };
+
+    \begin{axis}[
+        name=sim-1d,
+        anchor=north west, at={(sim-1c.right of north east)}, xshift=0.1cm,
+        ylabel near ticks, yticklabel pos=right
+    ]
+        \addplot[color = pca]   table[x = sample.size, y = dist.subspace.pca]   {aggr-1d-normal.csv};
+        \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1d-normal.csv};
+        \addplot[color = tsir]  table[x = sample.size, y = dist.subspace.tsir]  {aggr-1d-normal.csv};
+        \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1d-normal.csv};
+        \addplot[color = gmlm, line width=1pt]  table[x = sample.size, y = dist.subspace.gmlm]  {aggr-1d-normal.csv};
+    \end{axis}
+    \node[anchor = base west, yshift = 0.3em] at (sim-1d.north west) {
+        d: tri-diagonal $\boldsymbol{\Omega}$'s
+    };
+
+    \begin{axis}[
+        name=sim-1e,
+        anchor=north west, at={(sim-1c.below south west)}, yshift=-.8em
+    ]
+        \addplot[color = pca]   table[x = sample.size, y = dist.subspace.pca]   {aggr-1e-normal.csv};
+        \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1e-normal.csv};
+        \addplot[color = tsir]  table[x = sample.size, y = dist.subspace.tsir]  {aggr-1e-normal.csv};
+        \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1e-normal.csv};
+        \addplot[color = gmlm, line width=1pt]  table[x = sample.size, y = dist.subspace.gmlm]  {aggr-1e-normal.csv};
+    \end{axis}
+    \node[anchor = base west, yshift = 0.3em] at (sim-1e.north west) {
+        e: missspecified
+    };
+
+
+    \matrix[anchor = center] at (sim-1e.right of east -| sim-1d.south) {
+        \draw[color=gmlm,  legend entry, line width=1pt] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {GMLM};  \\
+        \draw[color=tsir,  legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {TSIR};  \\
+        \draw[color=mgcca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {MGCCA}; \\
+        \draw[color=hopca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {HOPCA}; \\
+        \draw[color=pca,   legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {PCA};   \\
+    };
+
+
+    \node[anchor = north] at (current bounding box.south) {Sample Size $n$};
+
+    \node[anchor = south, rotate = 90] at (current bounding box.west) {Subspace Distance $d(\boldsymbol{B}, \hat{\boldsymbol{B}})$};
+    \node[anchor = south, rotate = 270] at (current bounding box.east) {\phantom{Subspace Distance $d(\boldsymbol{B}, \hat{\boldsymbol{B}})$}};
+
+    \node[anchor = south, font=\large] at (current bounding box.north) {Tensor Normal Simulation};
+
+\end{tikzpicture}
+\end{document}
--- a/tensorPredictors/NAMESPACE
+++ b/tensorPredictors/NAMESPACE
@ -11,8 +11,8 @@ export(.projRank)
 export(.projSymBand)
 export(.projSymRank)
 export(CISE)
-export(D)
-export(D.pinv)
+export(Dup)
+export(Dup.pinv)
 export(GMLM)
 export(GMLM.default)
 export(HOPCA)
@ -32,6 +32,7 @@ export(RMReg)
 export(RMap)
 export(S)
 export(TSIR)
+export(approx.kron)
 export(approx.kronecker)
 export(colKronecker)
 export(decompose.kronecker)
@ -41,6 +42,7 @@ export(dist.projection)
 export(dist.subspace)
 export(exprs.all.equal)
 export(gmlm_ising)
+export(gmlm_mlm)
 export(gmlm_tensor_normal)
 export(icu_tensor_normal)
 export(ising_m2)
@ -72,6 +74,7 @@ export(reduce)
 export(riccati)
 export(rowKronecker)
 export(rtensornorm)
+export(slice.assign.expr)
 export(slice.expr)
 export(slice.select)
 export(sylvester)
--- a/tensorPredictors/R/approx_kronecker.R
+++ b/tensorPredictors/R/approx_kronecker.R
@ -1,3 +1,31 @@
+#' @examples
+#' nrows <- c(3, 2, 5)
+#' ncols <- c(2, 4, 8)
+#'
+#' A <- Reduce(kronecker, Map(matrix, Map(rnorm, nrows * ncols), nrows))
+#'
+#' all.equal(A, Reduce(kronecker, approx.kron(A, nrows, ncols)))
+#'
+#' @export
+approx.kron <- function(A, nrows, ncols) {
+
+    # rearrange kronecker product `A` into outer product `R`
+    dim(A) <- c(rev(nrows), rev(ncols))
+    axis.perm <- as.vector(t(matrix(seq_along(dim(A)), ncol = 2L))[, rev(seq_along(nrows))])
+    R <- aperm(A, axis.perm, resize = FALSE)
+    dim(R) <- nrows * ncols
+
+    # scaling factor for every product component
+    s <- sum(A^2)^(1 / length(dim(A)))
+
+    # higher order SVD on `R` with one singular vector
+    Map(function(mode, nr, nc) {
+        s * `dim<-`(La.svd(mcrossprod(R, mode = mode), 1L, 0L)$u, c(nr, nc))
+    }, seq_along(nrows), nrows, ncols)
+
+}
+
+
 #' Approximates Kronecker Product decomposition.
 #'
 #' Approximates the matrices `A` and `B` such that
--- a/tensorPredictors/R/gmlm_ising.R
+++ b/tensorPredictors/R/gmlm_ising.R
@ -1,42 +1,110 @@
 #' Specialized version of the GMLM for the Ising model (inverse Ising problem)
 #'
+#' @todo TODO: Add beta and Omega projections
+#'
 #' @export
 gmlm_ising <- function(X, F, sample.axis = length(dim(X)),
+    # proj.betas = ..., proj.Omegas = ...,  # TODO: this
    max.iter = 1000L,
    eps = sqrt(.Machine$double.eps),
-    step.size = function(iter) 1e-2 * (1000 / (iter + 1000)),
-    zig.zag.threashold = 5L,
-    patience = 5L,
-    nr.slices = 10L,                    # only for univariate `F(y) = y`
-    slice.method = c("cut", "ecdf"),    # only for univariate `F(y) = y` and `y` is a factor or integer
+    step.size = 1e-3,
+    zig.zag.threashold = 20L,
+    patience = 3L,
+    nr.slices = 20L,                            # only for univariate `F(y) = y`
+    slice.method = c("cut", "ecdf", "none"),    # only for univariate `F(y) = y` and `y` is a factor or integer
    logger = function(...) { }
 ) {
-    # # Special case for univariate response `vec F(y) = y`
-    # # Due to high computational costs we use slicing
-    # if (length(F) == prod(dim(F))) {
-    #     y <- as.vector(F)
-    #     if (!(is.factor(y) || is.integer(y))) {
-    #         slice.method <- match.arg(slice.method)
-    #         if (slice.method == "ecdf") {
-    #             y <- cut(ecdf(y)(y), nr.slices)
-    #         } else {
-    #             y <- cut(y, nr.slices)
-    #         }
-    #     }
-    #     slices <- split(seq_len(sample.size), y, drop = TRUE)
-    # } else {
-    #     slices <- seq_len(sample.size)
-    # }
+    # Get problem dimensions
+    dimX <- dim(X)[-sample.axis]
+    # threat scalar `F` as a tensor
+    if (is.null(dim(F))) {
+        dimF <- rep(1L, length(dimX))
+        dim(F) <- ifelse(seq_along(dim(X)) == sample.axis, sample.size, 1L)
+    } else {
+        dimF <- dim(F)[-sample.axis]
+    }
+    sample.size <- dim(X)[sample.axis]

-    dimX <- head(dim(X), -1)
-    dimF <- head(dim(F), -1)
-    sample.axis <- length(dim(X))
-    modes <- seq_len(length(dim(X)) - 1)
-    sample.size <- tail(dim(X), 1)
+    # rearrange `X`, `F` such that the last axis enumerates observations
+    if (sample.axis != length(dim(X))) {
+        axis.perm <- c(seq_along(dim(X))[-sample.axis], sample.axis)
+        X <- aperm(X, axis.perm)
+        F <- aperm(F, axis.perm)
+        sample.axis <- length(dim(X))
+    }
+    modes <- seq_along(dimX)

-    betas <- Map(matrix, Map(rnorm, dimX * dimF), dimX)
-    Omegas <- Map(diag, dimX)
+    # Special case for univariate response `vec F(y) = y`
+    # Due to high computational costs we use slicing
+    slice.method <- match.arg(slice.method)
+    slices.ind <- if ((slice.method != "none") && (length(F) == prod(dim(F)))) {
+        y <- as.vector(F)
+        if (!(is.factor(y) || is.integer(y))) {
+            slice.method <- match.arg(slice.method)
+            if (slice.method == "ecdf") {
+                y <- cut(ecdf(y)(y), nr.slices)
+            } else {
+                y <- cut(y, nr.slices)
+            }
+        }
+        split(seq_len(sample.size), y, drop = TRUE)
+    } else {
+        seq_len(sample.size)
+    }

+
+    # initialize betas with tensor normal estimate (ignoring data being binary)
+    fit_normal <- gmlm_tensor_normal(X, F, sample.axis = length(dim(X)))
+    betas <- fit_normal$betas
+
+    Omegas <- Omegas.init <- Map(function(mode) {
+        n <- prod(dim(X)[-mode])
+        prob2 <- mcrossprod(X, mode = mode) / n
+        prob2[prob2 == 0] <- 1 / n
+        prob1 <- diag(prob2)
+        `prob1^2` <- outer(prob1, prob1)
+
+        `diag<-`(log(((1 - `prob1^2`) / `prob1^2`) * prob2 / (1 - prob2)), 0)
+    }, modes)
+
+    # Determin degenerate combinations, that are variables which are exclusive
+    # in the data set
+    matX <- mat(X, sample.axis)
+    degen <- crossprod(matX) == 0
+    degen.mask <- which(degen)
+    # If there are degenerate combination, compute an (arbitrary) bound the
+    # log odds parameters of those combinations
+    if (any(degen.mask)) {
+        degen.ind <- arrayInd(degen.mask, dim(degen))
+        meanX <- colMeans(matX)
+        prodX <- meanX[degen.ind[, 1]] * meanX[degen.ind[, 2]]
+        degen.bounds <- log((1 - prodX) / (prodX * sample.size))
+        # Component indices in Omegas of degenerate two-way interactions
+        degen.ind <- arrayInd(degen.mask, rep(dimX, 2))
+        degen.ind <- Map(function(d, m) {
+            degen.ind[, m] + dimX[m] * (degen.ind[, m + length(dimX)] - 1L)
+        }, dimX, seq_along(dimX))
+
+        ## Enforce initial value degeneracy interaction param. constraints
+        # Extract parameters corresponding to degenerate interactions
+        degen.params <- do.call(rbind, Map(`[`, Omegas, degen.ind))
+        # Degeneracy Constrained Parameters (sign is dropped)
+        DCP <- mapply(function(vals, bound) {
+            logVals <- log(abs(vals))
+            err <- max(0, sum(logVals) - log(abs(bound)))
+            exp(logVals - (err / length(vals)))
+        }, split(degen.params, col(degen.params)), degen.bounds)
+        # Update values in Omegas such that all degeneracy constraints hold
+        Omegas <- Map(function(Omega, cp, ind) {
+            # Combine multiple constraints for every element into single
+            # constraint value per element
+            cp <- mapply(min, split(abs(cp), ind))
+            ind <- as.integer(names(cp))
+            `[<-`(Omega, ind, sign(Omega[ind]) * cp)
+        }, Omegas, split(DCP, row(DCP)), degen.ind)
+    }
+
+    # Initialize mean squared gradients
    grad2_betas <- Map(array, 0, Map(dim, betas))
    grad2_Omegas <- Map(array, 0, Map(dim, Omegas))

@ -49,12 +117,8 @@ gmlm_ising <- function(X, F, sample.axis = length(dim(X)),
    non_improving <- 0L

    # technical access points to dynamicaly access a multi-dimensional array
-    # with its last index
-    `X[..., i]` <- slice.expr(X, sample.axis, index = i)
+    `X[..., i]` <- slice.expr(X, sample.axis, index = i, drop = FALSE)
    `F[..., i]` <- slice.expr(F, sample.axis, index = i, drop = FALSE)
-    # the next expression if accessing the precomputed `mlm(F, betas)`
-    `BF[..., i]` <- slice.expr(BF, sample.axis, nr.axis = sample.axis, drop = FALSE) # BF[..., i]
-

    # Iterate till a break condition triggers or till max. nr. of iterations
    for (iter in seq_len(max.iter)) {
@ -62,31 +126,36 @@ gmlm_ising <- function(X, F, sample.axis = length(dim(X)),
        grad_betas <- Map(matrix, 0, dimX, dimF)
        Omega <- Reduce(kronecker, rev(Omegas))

+        # second order residuals accumulator
+        # `sum_i (X_i o X_i - E[X o X | Y = y_i])`
        R2 <- array(0, dim = c(dimX, dimX))

        # negative log-likelihood
        loss <- 0

-        BF <- mlm(F, betas)
+        for (i in slices.ind) {
+            # slice size (nr. of objects in the slice)
+            n_i <- length(i)

-        for (i in seq_len(sample.size)) {
-            params_i <- Omega + diag(as.vector(eval(`BF[..., i]`)))
+            sumF_i <- rowSums(eval(`F[..., i]`), dims = length(dimF))
+
+            diag_params_i <- mlm(sumF_i / n_i, betas)
+            params_i <- Omega + diag(as.vector(diag_params_i))
            m2_i <- ising_m2(params_i)

            # accumulate loss
-            x_i <- as.vector(eval(`X[..., i]`))
-            loss <- loss - (sum(x_i * (params_i %*% x_i)) + log(attr(m2_i, "prob_0")))
+            matX_i <- mat(eval(`X[..., i]`), modes)
+            loss <- loss - (
+                sum(matX_i * (params_i %*% matX_i)) + n_i * log(attr(m2_i, "prob_0"))
+            )

-            R2_i <- tcrossprod(x_i) - m2_i
+            R2_i <- tcrossprod(matX_i) - n_i * m2_i
            R1_i <- diag(R2_i)
            dim(R1_i) <- dimX

            for (j in modes) {
                grad_betas[[j]] <- grad_betas[[j]] +
-                    mcrossprod(
-                        R1_i, mlm(eval(`F[..., i]`), betas[-j], modes[-j]), j,
-                        dimB = ifelse(j != modes, dimX, dimF)
-                    )
+                    mcrossprod(R1_i, mlm(sumF_i, betas[-j], modes[-j]), j)
            }
            R2 <- R2 + as.vector(R2_i)
        }
@ -98,17 +167,14 @@ gmlm_ising <- function(X, F, sample.axis = length(dim(X)),
        }, modes)


-        # update and accumulate alternating loss
+        # update optimization behavioral trackers
        accum_sign <- sign(last_loss - loss) - accum_sign
-        # check if accumulated alternating signs exceed stopping threshold
+        non_improving <- max(0L, non_improving - 1L + 2L * (last_loss < loss))
+
+        # check break conditions
        if (abs(accum_sign) > zig.zag.threashold) { break }
-        # increment non improving counter if thats the case
-        if (!(loss < last_loss)) {
-            non_improving <- non_improving + 1L
-        } else {
-            non_improving <- 0L
-        }
        if (non_improving > patience) { break }
+        if (abs(last_loss - loss) < eps * last_loss) { break }

        # store current loss for the next iteration
        last_loss <- loss
@ -122,17 +188,219 @@ gmlm_ising <- function(X, F, sample.axis = length(dim(X)),
        # logging (before parameter update)
        logger(iter, loss, betas, Omegas, grad_betas, grad_Omegas)

-        # gradualy decrease the step size
-        step <- if (is.function(step.size)) step.size(iter) else step.size
-
        # Update Parameters
        betas <- Map(function(beta, grad, m2) {
-            beta + (step / (sqrt(m2) + eps)) * grad
+            beta + (step.size / (sqrt(m2) + eps)) * grad
        }, betas, grad_betas, grad2_betas)
        Omegas <- Map(function(Omega, grad, m2) {
-            Omega + (step / (sqrt(m2) + eps)) * grad
+            Omega + (step.size / (sqrt(m2) + eps)) * grad
        }, Omegas, grad_Omegas, grad2_Omegas)
+
+        # Enforce degeneracy parameter constraints
+        if (any(degen.mask)) {
+            # Extract parameters corresponding to degenerate interactions
+            degen.params <- do.call(rbind, Map(`[`, Omegas, degen.ind))
+            # Degeneracy Constrained Parameters (sign is dropped)
+            DCP <- mapply(function(vals, bound) {
+                logVals <- log(abs(vals))
+                err <- max(0, sum(logVals) - log(abs(bound)))
+                exp(logVals - (err / length(vals)))
+            }, split(degen.params, col(degen.params)), degen.bounds)
+            # Update values in Omegas such that all degeneracy constraints hold
+            Omegas <- Map(function(Omega, cp, ind) {
+                # Combine multiple constraints for every element into single
+                # constraint value per element
+                cp <- mapply(min, split(abs(cp), ind))
+                ind <- as.integer(names(cp))
+                `[<-`(Omega, ind, sign(Omega[ind]) * cp)
+            }, Omegas, split(DCP, row(DCP)), degen.ind)
+        }
    }

-    list(betas = betas, Omegas = Omegas)
+    structure(
+        list(eta1 = array(0, dimX), betas = betas, Omegas = Omegas),
+        tensor_normal = fit_normal,
+        Omegas.init = Omegas.init,
+        degen.mask = degen.mask
+    )
+}
+
+
+
+################################################################################
+###      Development Interactive Block (Delete / Make sim / TODO: ...)       ###
+################################################################################
+if (FALSE) { # interactive()
+
+par(bg = "#1d1d1d",
+    fg = "lightgray",
+    col      = "#d5d5d5",
+    col.axis = "#d5d5d5",
+    col.lab  = "#d5d5d5",
+    col.main = "#d5d5d5",
+    col.sub  = "#d5d5d5", # col.sub  = "#2467d0"
+    pch = 16
+)
+cex <- 1.25
+col <- colorRampPalette(c("#f15050", "#1d1d1d", "#567DCA"))(256)
+
+
+.logger <- function() {
+    iter <- 0L
+    assign("log", data.frame(
+        iter            = rep(NA_integer_, 100000),
+        loss            = rep(NA_real_, 100000),
+        dist.B          = rep(NA_real_, 100000),
+        dist.Omega      = rep(NA_real_, 100000),
+        norm.grad.B     = rep(NA_real_, 100000),
+        norm.grad.Omega = rep(NA_real_, 100000)
+    ), envir = .GlobalEnv)
+    assign("B.gmlm", NULL, .GlobalEnv)
+    assign("Omega.gmlm", NULL, .GlobalEnv)
+
+    function(it, loss, betas, Omegas, grad_betas, grad_Omegas) {
+        # Store in global namespace (allows to stop and get the results)
+        B.gmlm <- Reduce(kronecker, rev(betas))
+        assign("B.gmlm", B.gmlm, .GlobalEnv)
+        Omega.gmlm <- Reduce(kronecker, rev(Omegas))
+        assign("Omega.gmlm", Omega.gmlm, .GlobalEnv)
+
+        dist.B          <- dist.subspace(B.true, B.gmlm, normalize = TRUE)
+        dist.Omega      <- norm(Omega.true - Omega.gmlm, "F")
+        norm.grad.B     <- sqrt(sum(mapply(norm, grad_betas, "F")^2))
+        norm.grad.Omega <- sqrt(sum(mapply(norm, grad_Omegas, "F")^2))
+
+        log[iter <<- iter + 1L, ] <<- list(
+            it, loss, dist.B, dist.Omega, norm.grad.B, norm.grad.Omega
+        )
+        cat(sprintf("\r%3d - d(B): %.3f, d(O): %.3f, |g(B)|: %.3f, |g(O)|: %.3f, loss: %.3f\033[K",
+            it, dist.B, dist.Omega, norm.grad.B, norm.grad.Omega, loss))
+    }
+}
+
+
+sample.size <- 1000
+dimX <- c(2, 3)              # predictor `X` dimension
+dimF <- rep(1, length(dimX))    # "function" `F(y)` of responce `y` dimension
+
+betas <- Map(diag, 1, dimX, dimF)
+Omegas <- list(toeplitz(c(0, -2)), toeplitz(seq(1, 0, by = -0.5)))
+
+B.true <- Reduce(kronecker, rev(betas))
+Omega.true <- Reduce(kronecker, rev(Omegas))
+
+# data sampling routine
+c(X, F, y, sample.axis) %<-% (sample.data <- function(sample.size, betas, Omegas) {
+    dimX <- mapply(nrow, betas)
+    dimF <- mapply(ncol, betas)
+
+    # generate response (sample axis is last axis)
+    y <- runif(prod(sample.size, dimF), -2, 2)
+    F <- array(y, dim = c(dimF, sample.size))    # ~ U[-1, 1]
+
+    Omega <- Reduce(kronecker, rev(Omegas))
+
+    X <- apply(F, length(dim(F)), function(Fi) {
+        dim(Fi) <- dimF
+        params <- diag(as.vector(mlm(Fi, betas))) + Omega
+        tensorPredictors::ising_sample(1, params)
+    })
+    dim(X) <- c(dimX, sample.size)
+
+    list(X = X, F = F, y = y, sample.axis = length(dim(X)))
+})(sample.size, betas, Omegas)
+
+local({
+    X.proto <- array(seq_len(prod(dimX)), dimX)
+    interactions <- crossprod(mat(X, sample.axis))
+    dimnames(interactions) <- rep(list(
+        do.call(paste0, c("X", Map(slice.index, list(X.proto), seq_along(dimX))))
+    ), 2)
+    cat("Sample Size: ", sample.size, "\n")
+    print.table(interactions, zero.print = ".")
+})
+
+# system.time({
+#     fit.gmlm <- gmlm_ising(X, y, logger = .logger())
+# })
+Rprof()
+gmlm_ising(X, y)
+Rprof(NULL)
+summaryRprof()
+
+B.gmlm <- Reduce(kronecker, rev(fit.gmlm$betas))
+Omega.gmlm <- Reduce(kronecker, rev(fit.gmlm$Omegas))
+
+B.normal <- Reduce(kronecker, rev(attr(fit.gmlm, "tensor_normal")$betas))
+Omega.init <- Reduce(kronecker, rev(attr(fit.gmlm, "Omegas.init")))
+degen.mask <- attr(fit.gmlm, "degen.mask")
+
+local({
+    layout(matrix(c(
+        1, 2, 3, 3, 3,
+        1, 4, 5, 6, 7
+    ), nrow = 2, byrow = TRUE), width = c(6, 3, 1, 1, 1))
+
+    with(na.omit(log), {
+        plot(range(iter), c(0, 1), type = "n", bty = "n",
+            xlab = "Iterations", ylab = "Distance")
+
+        lines(iter, dist.B, col = "red", lwd = 2)
+        lines(iter, dist.Omega / max(dist.Omega), col = "blue", lwd = 2)
+        lines(iter, (loss - min(loss)) / diff(range(loss)), col = "darkgreen", lwd = 2)
+
+        norm.grad <- sqrt(norm.grad.B^2 + norm.grad.Omega^2)
+        # Scale all gradient norms
+        norm.grad.B     <- norm.grad.B     / max(norm.grad)
+        norm.grad.Omega <- norm.grad.Omega / max(norm.grad)
+        norm.grad       <- norm.grad       / max(norm.grad)
+        lines(iter, norm.grad.B,     lty = 2, col = "red")
+        lines(iter, norm.grad.Omega, lty = 2, col = "blue")
+        lines(iter, norm.grad,       lty = 2, col = "darkgreen")
+
+        axis(4, at = c(
+            tail(dist.B, 1),
+            min(dist.B)
+        ), labels = round(c(
+            tail(dist.B, 1),
+            min(dist.B)
+        ), 2), col = NA, col.ticks = "red", las = 1)
+        axis(4, at = c(
+            1,
+            tail(dist.Omega, 1) / max(dist.Omega),
+            min(dist.Omega) / max(dist.Omega)
+        ), labels = round(c(
+            max(dist.Omega),
+            tail(dist.Omega, 1),
+            min(dist.Omega)
+        ), 2), col = NA, col.ticks = "blue", las = 1)
+
+        abline(h = c(tail(dist.B, 1), min(dist.B)),
+            lty = "dotted", col = "red")
+        abline(h = c(max(dist.Omega), tail(dist.Omega, 1), min(dist.Omega)) / max(dist.Omega),
+            lty = "dotted", col = "blue")
+
+    })
+    legend("topright", col = c("red", "blue", "darkgreen"), lty = 1, lwd = 2,
+        legend = c("dist.B", "dist.Omega", "loss"), bty = "n")
+
+    zlim <- max(abs(range(Omega.true, Omega.init, Omega.gmlm))) * c(-1, 1)
+    matrixImage(Omega.true,   main = "true",   zlim = zlim, add.values = TRUE, col = col, cex = cex)
+    matrixImage(round(Omega.init, 2), main = "init (cond. prob.)", zlim = zlim, add.values = TRUE, col = col, cex = cex)
+    mtext(round(norm(Omega.true - Omega.init, "F"), 3), 3)
+    matrixImage(round(Omega.gmlm, 2),   main = "gmlm (ising)",   zlim = zlim, add.values = TRUE, col = col, cex = cex,
+        col.values = c(par("col"), "red")[`[<-`(array(1, rep(prod(dim(X)[-sample.axis]), 2)), degen.mask, 2)])
+    mtext(round(norm(Omega.true - Omega.gmlm, "F"), 3), 3)
+
+    zlim <- max(abs(range(B.true, B.normal, B.gmlm))) * c(-1, 1)
+    matrixImage(B.true,             main = "true",
+        zlim = zlim, add.values = TRUE, col = col, cex = cex)
+    matrixImage(round(B.normal, 2), main = "init (normal)",
+        zlim = zlim, add.values = TRUE, axes = FALSE, col = col, cex = cex)
+    mtext(round(dist.subspace(B.true, B.normal, normalize = TRUE), 3), 3)
+    matrixImage(round(B.gmlm, 2),   main = "gmlm (ising)",
+        zlim = zlim, add.values = TRUE, axes = FALSE, col = col, cex = cex)
+    mtext(round(dist.subspace(B.true, B.gmlm, normalize = TRUE), 3), 3)
+})
+
 }
--- a/tensorPredictors/R/gmlm_tensor_normal.R
+++ b/tensorPredictors/R/gmlm_tensor_normal.R
@ -1,109 +1,10 @@
-
-# p <- 5
-# A <- matrix(rnorm(p^2), p)
-
-# mat.proj("TriDiag", p)(A)
-# mat.proj("SymTriDiag", p)(A)
-
-# A
-# (AA <- mat.proj("PSD", p)(A))
-# mat.proj("PSD", p)(AA)
-
-
-# p <- 5
-# A <- matrix(rnorm(p^2), p)
-
-# projection <- function(T2) {
-#     P <- pinv(T2) %*% T2
-#     function(A) {
-#         matrix(P %*% as.vector(A), nrow(A))
-#     }
-# }
-
-
-# # All equal diagonal matrix, A = a * I_p
-# T2 <- t(as.vector(diag(p)))
-# proj <- projection(T2)
-
-# print.table( diag(mean(diag(A)), nrow(A), ncol(A))          , zero.print = ".")
-# print.table( matrix((pinv(T2) %*% T2) %*% as.vector(A), p)  , zero.print = ".")
-# print.table( proj(A)                                        , zero.print = ".")
-
-
-# # Diagonal matrix, A = diag(a_1, ..., a_p)
-# T2 <- matrix(seq_len(p^2), p)
-# T2 <- outer(diag(T2), as.vector(T2), `==`)
-# storage.mode(T2) <- "double"
-# proj <- projection(T2)
-
-# print.table( T2 , zero.print = ".")
-
-# print.table( diag(diag(A))                                  , zero.print = ".")
-# print.table( proj(A)                                        , zero.print = ".")
-
-
-# # Tri-Diagonal Matrix
-# T2 <- matrix(seq_len(p^2), p)
-# T2 <- outer(T2[abs(row(A) - col(A)) <= 1], as.vector(T2), `==`)
-# storage.mode(T2) <- "double"
-
-# triDiag.mask <- (abs(row(A) - col(A)) <= 1)
-# storage.mode(triDiag.mask) <- "double"
-
-# print.table( T2 , zero.print = ".")
-# print.table( triDiag.mask , zero.print = ".")
-
-# print.table( A * triDiag.mask                               , zero.print = ".")
-# print.table( matrix((pinv(T2) %*% T2) %*% as.vector(A), p)  , zero.print = ".")
-
-
-
-# # All equal main and off diagonals
-# T2 <- Reduce(rbind, Map(function(i) {
-#     as.double(row(A) == i + col(A))
-# }, (1 - p):(p - 1)))
-
-# print.table( T2 , zero.print = ".")
-
-# print.table( matrix((pinv(T2) %*% T2) %*% as.vector(A), p)  , zero.print = ".")
-
-
-# # Symmetric all equal main and off diagonals
-# T2 <- Reduce(rbind, Map(function(i) {
-#     as.double(abs(row(A) - col(A)) == i)
-# }, 0:(p - 1)))
-
-# print.table( T2 , zero.print = ".")
-
-# print.table( matrix((pinv(T2) %*% T2) %*% as.vector(A), p)  , zero.print = ".")
-
-
-# # Symetric Matrix
-# index <- matrix(seq_len(p^2), p)
-# T2 <- Reduce(rbind, Map(function(i) {
-#     e_i <- index == i
-#     as.double(e_i | t(e_i))
-# }, index[lower.tri(index, diag = TRUE)]))
-# proj <- projection(T2)
-
-# print.table( T2 , zero.print = ".")
-
-# print.table( 0.5 * (A + t(A))   , zero.print = ".")
-# print.table( proj(A)            , zero.print = ".")
-
-# proj(solve(A))
-# solve(proj(A))
-
-
-
-
 #' Specialized version of GMLM for the tensor normal model
 #'
 #' The underlying algorithm is an ``iterative (block) coordinate descent'' method
 #'
 #' @export
 gmlm_tensor_normal <- function(X, F, sample.axis = length(dim(X)),
-    max.iter = 100L, proj.betas = NULL, proj.Omegas = NULL, logger = NULL,       # TODO: proj.betas NOT USED!!!
+    max.iter = 100L, proj.betas = NULL, proj.Omegas = NULL, logger = NULL,
    cond.threshold = 25, eps = 1e-6
 ) {
    # rearrange `X`, `F` such that the last axis enumerates observations
@ -152,7 +53,7 @@ gmlm_tensor_normal <- function(X, F, sample.axis = length(dim(X)),
    }, mcov(F, sample.axis, center = FALSE))

    # initialization of betas ``covariance direction mappings``
-    betas <- Map(function(dX, dF) {
+    betas <- betas.init <- Map(function(dX, dF) {
        s <- min(ncol(dX), nrow(dF))
        crossprod(dX[1:s, , drop = FALSE], dF[1:s, , drop = FALSE])
    }, dirsX, dirsF)
@ -226,132 +127,8 @@ gmlm_tensor_normal <- function(X, F, sample.axis = length(dim(X)),
    }


-    list(eta1 = meanX, betas = betas, Omegas = Omegas)
+    structure(
+        list(eta1 = mlm(meanX, Sigmas), betas = betas, Omegas = Omegas),
+        betas.init = betas.init
+    )
 }
-
-
-
-# #### DEBUGGING
-# library(tensorPredictors)
-
-# # setup dimensions
-# n <- 1e3
-# p <- c(5, 7, 3)
-# q <- c(2, 5, 3)
-
-# max.iter <- 10L
-
-# # create "true" GLM parameters
-# eta1 <- array(rnorm(prod(p)), dim = p)
-# betas <- Map(matrix, Map(rnorm, p * q), p)
-# Omegas <- Map(function(p_j) {
-#     solve(0.5^abs(outer(seq_len(p_j), seq_len(p_j), `-`)))
-# }, p)
-# true.params <- list(eta1 = eta1, betas = betas, Omegas = Omegas)
-
-# # compute tensor normal parameters from GMLM parameters
-# Sigmas <- Map(solve, Omegas)
-# mu <- mlm(eta1, Sigmas)
-
-# # sample some test data
-# sample.axis <- length(p) + 1L
-# F <- array(rnorm(n * prod(q)), dim = c(q, n))
-# X <- mlm(F, Map(`%*%`, Sigmas, betas)) + rtensornorm(n, mu, Sigmas, sample.axis)
-
-# # setup a logging callback
-# format <- sprintf("iter: %%3d, %s, Omega: %%8.2f, resid: %%8.3f\n",
-#     paste0("beta.", seq_along(betas), ": %.3f", collapse = ", ")
-# )
-# hist <- data.frame(
-#     iter = seq(0, max.iter),
-#     dist.B = numeric(max.iter + 1),
-#     dist.Omega = numeric(max.iter + 1),
-#     resid = numeric(max.iter + 1)
-# )
-# logger <- function(iter, betas, Omegas, resid) {
-#     dist.B <- dist.kron.norm(true.params$betas, betas)
-#     dist.Omega <- dist.kron.norm(true.params$Omegas, Omegas)
-#     resid <- mean(resid^2)
-
-#     hist[iter + 1, -1] <<- c(dist.B = dist.B, dist.Omega = dist.Omega, resid = resid)
-#     cat(do.call(sprintf, c(
-#         list(format, iter),
-#         Map(dist.subspace, betas, true.params$betas),
-#         dist.Omega, resid
-#     )))
-# }
-
-# # run the model
-# est.params <- gmlm_tensor_normal(X, F, max.iter = max.iter, logger = logger)
-
-# # run model with Omegas in sub-manifolds of SPD matrices
-# est.params <- gmlm_tensor_normal(X, F, max.iter = max.iter, logger = logger,
-#     proj.Omegas = list(
-#         NULL,
-#         (function(nrow, ncol) {
-#             triDiag.mask <- (abs(.row(c(nrow, ncol)) - .col(c(nrow, ncol))) <= 1)
-#             storage.mode(triDiag.mask) <- "double"
-#             function(A) { A * triDiag.mask }
-#         })(p[2], p[2]),
-#         function(A) diag(diag(A))
-#     ))
-
-
-
-# # # Profile the fitting routine without logging
-# # profvis::profvis({
-# #     est.params <- gmlm_tensor_normal(X, F, max.iter = max.iter)
-# # })
-
-# with(hist, {
-#     plot(range(iter), range(hist[, -1]), type = "n", log = "y")
-#     lines(iter, resid, col = "red")
-#     lines(iter, dist.B, col = "green")
-#     lines(iter, dist.Omega, col = "blue")
-#     legend("topright", legend = c("resid", "dist.B", "dist.Omega"), col = c("red", "green", "blue"), lty = 1)
-# })
-
-
-# # par(mfrow = c(1, 2))
-
-# # matrixImage(Reduce(kronecker, rev(est.params$Omegas)))
-# # matrixImage(Reduce(kronecker, rev(true.params$Omegas)))
-# # # matrixImage(Reduce(kronecker, rev(est.params$Omegas)) - Reduce(kronecker, rev(true.params$Omegas)))
-
-# # par(mfrow = c(1, 2))
-
-# # matrixImage(Reduce(kronecker, rev(true.params$betas)), main = "True", col = hcl.colors(48, palette = "Blue-Red 3"))
-# # matrixImage(Reduce(kronecker, rev(est.params$betas)), main = "Est", col = hcl.colors(48, palette = "Blue-Red 3"))
-
-
-# # # unlist(Map(kappa, mcov(X)))
-# # # unlist(Map(kappa, Omegas))
-# # # unlist(Map(kappa, Sigmas))
-
-
-# # # max(unlist(Map(function(C) max(eigen(C)$values), mcov(X))))
-# # # min(unlist(Map(function(C) min(eigen(C)$values), mcov(X))))
-# # # Map(function(C) eigen(C)$values, mcov(X))
-
-# # # kappa(Reduce(kronecker, mcov(X)))
-
-# # kappa1.fun <- function(Omegas) Map(kappa, Omegas)
-# # kappa2.fun <- function(Omegas) Map(kappa, Omegas, exact = TRUE)
-# # eigen1.fun <- function(Omegas) {
-# #     Map(function(Omega) {
-# #         min_max <- range(eigen(Omega)$values)
-# #         min_max[2] / min_max[1]
-# #     }, Omegas)
-# # }
-# # eigen2.fun <- function(Omegas) {
-# #     Map(function(Omega) {
-# #         min_max <- range(eigen(Omega, TRUE, TRUE)$values)
-# #         min_max[2] / min_max[1]
-# #     }, Omegas)
-# # }
-# # microbenchmark::microbenchmark(
-# #     kappa1.fun(Omegas),
-# #     kappa2.fun(Omegas),
-# #     eigen1.fun(Omegas),
-# #     eigen2.fun(Omegas)
-# # )
--- a/tensorPredictors/R/matrixImage.R
+++ b/tensorPredictors/R/matrixImage.R
@ -19,6 +19,7 @@
 matrixImage <- function(A, add.values = FALSE,
    main = NULL, sub = NULL, interpolate = FALSE, ..., zlim = NA,
    axes = TRUE, asp = 1, col = hcl.colors(24, "Blue-Red 3", rev = FALSE),
+    col.values = par("col"), cex = 1,
    digits = getOption("digits"), new.plot = TRUE
 ) {
    # plot raster image
@ -56,6 +57,7 @@ matrixImage <- function(A, add.values = FALSE,
            A[!add.values] <- NA
            A[add.values] <- format(A[add.values], digits = digits)
        }
-        text(rep(x - 0.5, each = nrow(A)), rep(rev(y - 0.5), ncol(A)), A, adj = 0.5)
+        text(rep(x - 0.5, each = nrow(A)), rep(rev(y - 0.5), ncol(A)), A,
+             adj = 0.5, cex = cex, col = col.values)
    }
 }