fix: gmlm_chess bug (save point saved "last_loss = Inf"),

replaced self-implemented SIR in sim-tsir with "do.sir" from Rdimtools,
add: fen2tensor image (TikZ),
add: section chess data example in paper.tex
This commit is contained in:
Daniel Kapla 2023-12-13 18:46:17 +01:00
parent 6792cf93a9
commit 078c406100
7 changed files with 356 additions and 134 deletions

View File

@ -0,0 +1,78 @@
\documentclass{standalone}
\usepackage[LSB, T1]{fontenc}
\usepackage{chessboard}
\usepackage{skak}
\usepackage{tikz, tikz-3dplot}
\usepackage{amsmath}
\usepackage{xcolor}
\newcommand{\z}{{\color{gray}0}}
\tdplotsetmaincoords{80}{135}
\setboardfontencoding{LSB}
\setchessboard{linewidth = 0.1em, showmover = false, smallboard}
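% Draw a single layer of the 3D tensor: #1 is the layer index (piece channel)
% used as the depth offset, #2 the list of squares that hold a 1 in this layer.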
\newcommand{\chessplane}[2]{
\begin{scope}[canvas is yz plane at x={-#1 * 0.8}, transform shape]
\node[fill = white, opacity = 0.7, outer sep=0pt, inner sep=2pt] (layer#1) at (0, 0) {
\chessboard[
margin=false,
pgfstyle=text,
text=\textbf{1},
markfields={#2},
label=false
]
};
\end{scope}
}
\begin{document}
\begin{tikzpicture}
\begin{scope}[tdplot_main_coords, scale = 1]
\chessplane{12}{e8};
\chessplane{11}{d8};
\chessplane{10}{a8, h8};
\chessplane{9}{c8, f8};
\chessplane{8}{b8, g8};
\chessplane{7}{a7, b7, c7, d7, e7, f7, g7, h7};
\chessplane{6}{e1};
\chessplane{5}{d1};
\chessplane{4}{a1, h1};
\chessplane{3}{c1, f1};
\chessplane{2}{b1, g1};
\chessplane{1}{a2, b2, c2, d2, e2, f2, g2, h2};
\begin{scope}[canvas is yz plane at x={-1}, transform shape]
\node[anchor = south, rotate = 90] at (layer1.west) {Ranks / Axis 1};
\node[anchor = north] at (layer1.south) {Files / Axis 2};
\end{scope}
\coordinate (offset) at (layer1.west);
\newdimen\xoffset
\pgfextractx{\xoffset}{\pgfpointanchor{offset}{center}}
\begin{scope}[canvas is xz plane at y=\xoffset, transform shape, xscale=-1]
\path (layer1.north west) -- (layer12.north west) node[
pos = 0.5, anchor = south
] {Pieces / Axis 3};
\end{scope}
\end{scope}
\coordinate (tensor north) at (current bounding box.north);
\node[shift = {(0, 0)}, anchor = east] (pos) at (current bounding box.west) {{
\setchessboard{linewidth = 0.1em, showmover = false, smallboard}
\newgame
\chessboard{}
}};
\node[anchor = south] at (pos.center |- tensor north) {Start Position};
\node[anchor = south] at (tensor north) {3D Tensor};
\end{tikzpicture}
\end{document}

View File

@ -327,12 +327,12 @@
}
@article{sdr-mixedPredictors-BuraForzaniEtAl2022,
author = {Bura and Forzani and TODO},
title = {Sufficient reductions in regression with mixed predictors},
journal = {},
volume = {},
number = {},
year = {2022}
author = {Bura and Forzani and TODO},
title = {Sufficient reductions in regression with mixed predictors},
journal = {},
volume = {},
number = {},
year = {2022}
}
@article{sparseIsing-ChengEtAt2014,
@ -364,3 +364,40 @@
author = {Kapla, Daniel},
year = {2019}
}
@article{MGCCA-GirkaEtAl2024,
title = {Tensor generalized canonical correlation analysis},
author = {Fabien Girka and Arnaud Gloaguen and Laurent {Le Brusquet} and Violetta Zujovic and Arthur Tenenhaus},
year = {2024},
journal = {Information Fusion},
volume = {102},
issn = {1566-2535},
doi = {10.1016/j.inffus.2023.102045}
}
@Article{Rdimtools,
title = {{Rdimtools}: An {R} Package for Dimension Reduction and Intrinsic Dimension Estimation},
author = {Kisung You and Dennis Shung},
journal = {Software Impacts},
year = {2022},
volume = {14},
issn = {2665-9638},
pages = {100414},
doi = {10.1016/j.simpa.2022.100414},
}
@misc{lichess-database,
author = {{Thibault Duplessis}},
title = {lichess.org open database},
year = {2013},
url = {https://database.lichess.org},
note = {visited on December 8, 2023},
}
@misc{stockfish,
title = {Stockfish},
year = {since 2008},
author = {{The Stockfish developers (see \href{https://github.com/official-stockfish/Stockfish/blob/master/AUTHORS}{AUTHORS} file)}},
url = {https://stockfishchess.org/},
abstract = {Stockfish is a free and strong UCI chess engine.},
}

View File

@ -1,7 +1,8 @@
\documentclass[a4paper, 10pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage[LSF, T1]{fontenc}
\usepackage{chessfss}
\usepackage{fullpage}
\usepackage{amsmath, amssymb, amstext, amsthm, scalerel, bm, pifont}
\usepackage[dvipsnames]{xcolor} % dvipsnames loads more named colors
@ -141,9 +142,13 @@
\newcommand{\DiagZeroMat}[1]{\mathbb{R}_{\diag=0}^{{#1}\times {#1}}}
\newcommand{\SymMat}[1]{\mathrm{Sym}^{{#1}\times {#1}}}
\newcommand{\SkewSymMat}[1]{\mathrm{Skew}^{{#1}\times {#1}}}
\newcommand{\SymPosDefMat}[1]{\mathrm{Sym}_{++}^{{#1}\times {#1}}}
% \newcommand{\SymPosDefMat}[1]{\mathrm{Sym}_{++}^{{#1}\times {#1}}}
\newcommand{\SymPosDefMat}[1]{\mathrm{SPD}(#1)}
\newcommand{\SymDiagZeroMat}[1]{\mathrm{Sym}_{\diag=0}^{{#1}\times {#1}}}
\newcommand{\SymBand}[2]{\mathrm{SymBand}^{{#1}\times {#1}}_{#2}}
\newcommand{\UnitaryGrp}[1]{\mathrm{U}(#1)}
\newcommand{\SpecialUnitaryGrp}[1]{\mathrm{SU}(#1)}
% \newcommand{\AR}[1]{\mathrm{AR}(#1)}
\newcommand{\todo}[1]{{\color{red}TODO: #1}}
\newcommand{\efi}[1]{{\color{blue}Effie: #1}}
@ -572,11 +577,11 @@ Suppose $\ten{X}\mid Y = y$ follows a tensor normal distribution with mean $\te
\begin{displaymath}
f_{\mat{\theta}}(\ten{X}\mid Y = y) = (2\pi)^{-p / 2}\prod_{k = 1}^{r}\det(\mat{\Sigma}_k)^{-p / (2 p_k)}\exp\left( -\frac{1}{2}\left\langle\ten{X} - \ten{\mu}_y, (\ten{X} - \ten{\mu}_y)\mlm_{k = 1}^{r}\mat{\Sigma}_k^{-1} \right\rangle \right).
\end{displaymath}
For the sake of simplicity and w.l.o.g., we assume $\ten{X}$ has 0 marginal expectation; i.e., $\E\ten{X} = 0$. Rewriting this in the quadratic exponential family form \eqref{eq:quadratic-exp-fam}, determines the scaling constant $c = -1/2$. The relation to the GMLM parameters $\overline{\ten{\eta}}, \mat{\beta}_k$ and $\mat{\Omega}_k$, for $k = 1, \ldots, r$ is
\begin{displaymath}
For the sake of simplicity and w.l.o.g., we assume $\ten{X}$ has 0 marginal expectation; i.e., $\E\ten{X} = 0$. Rewriting this in the quadratic exponential family form \eqref{eq:quadratic-exp-fam} determines the scaling constant $c = -1/2$. The relation to the GMLM parameters $\mat{\beta}_k$ and $\mat{\Omega}_k$, for $k = 1, \ldots, r$, is
\begin{equation}\label{eq:tnormal_cond_params}
\ten{\mu}_y = \ten{F}_y\mlm_{k = 1}^{r}\mat{\Omega}_k^{-1}\mat{\beta}_k, \qquad
\mat{\Omega}_k = \mat{\Sigma}_k^{-1},
\end{displaymath}
\end{equation}
where we used that $\overline{\ten{\eta}} = 0$ due to $0 = \E\ten{X} = \E\E[\ten{X}\mid Y] = \E\ten{\mu}_Y$ in combination with $\E\ten{F}_Y = 0$. Additionally, all the $\mat{\Omega}_k$'s are symmetric positive definite, because the $\mat{\Sigma}_k$'s are. This leads to another simplification, since then $\mat{T}_2$ in \eqref{eq:t-stat} equals the identity. This also means that the gradients of the log-likelihood $l_n$ in \cref{thm:grad} are simpler. We obtain
\begin{displaymath}
\ten{g}_1(\mat{\eta}_y) = \E[\ten{X}\mid Y = y] = \ten{\mu}_y, \qquad
@ -661,7 +666,7 @@ We extend the conditional PMF by allowing the binary variables to be tensor valu
&= p_0(\mat{\gamma}_y)\exp(\t{\vech((\vec{\ten{X}})\t{(\vec{\ten{X}})})}\mat{\gamma}_y) \nonumber \\
&= p_0(\mat{\gamma}_y)\exp\Bigl(\Bigl\langle \ten{X}, \ten{F}_y\mlm_{k = 1}^{r}\mat{\beta}_k \Bigr\rangle + \Bigl\langle\ten{X}\mlm_{k = 1}^{r}\mat{\Omega}_k, \ten{X}\Bigr\rangle\Bigr)\label{eq:ising-cond-prob}
\end{align}
where we set $\overline{\ten{\eta}} = 0$ and $\mat{T}_2$ to the identity. This is an additional constraint to the model, the reason is that the diagonal elements of $\mat{\Omega} = \bigkron_{k = r}^{1}\mat{\Omega}_k$ take the role of $\overline{\ten{\eta}}$, althoug not fully. Having the diagonal of $\mat{\Omega}$ and $\overline{\ten{\eta}}$ handling the self interaction effects might lead to interference in the optimization routine. Another approach would be to use the $\mat{T}_2$ matrix to set the corresponding diagonal elements of $\mat{\Omega}$ to zero and let $\overline{\ten{\eta}}$ handle the self interaction effect. All of those approaches, namely setting $\overline{\ten{\eta}} = 0$, keeping $\overline{\ten{\eta}}$ or using $\mat{T}_2$, are theoretically solid and compatible with \cref{thm:grad,thm:param-manifold,thm:asymptotic-normality-gmlm}, assuming all axis dimensions $p_k$ are non-degenerate, that is $p_k > 1$ for all $k = 1, \ldots, r$. Regardles, under our modeling choise we get the relation between the natural parameters $\mat{\gamma}_y$ of the conditional Ising model and the GMLM parameters $\mat{\beta}_k$ and $\mat{\Omega}_k$ as
where we set $\overline{\ten{\eta}} = 0$ and $\mat{T}_2$ to the identity. This is an additional constraint on the model; the reason for $\overline{\ten{\eta}} = 0$ is that the diagonal elements of $\mat{\Omega} = \bigkron_{k = r}^{1}\mat{\Omega}_k$ take the role of $\overline{\ten{\eta}}$, although not fully. Having both the diagonal of $\mat{\Omega}$ and $\overline{\ten{\eta}}$ handle the self-interaction effects might lead to interference in the optimization routine. Another approach would be to use the $\mat{T}_2$ matrix to set the corresponding diagonal elements of $\mat{\Omega}$ to zero and let $\overline{\ten{\eta}}$ handle the self-interaction effects. In \cref{sec:chess} we actually use $\mat{T}_2$ to enforce zero constraints on specific elements of $\mat{\Omega}$. But for simplicity, we set $\mat{T}_2$ to the identity in the following. All of these approaches, namely setting $\overline{\ten{\eta}} = 0$, keeping $\overline{\ten{\eta}}$, or using $\mat{T}_2$, are theoretically sound and compatible with \cref{thm:grad,thm:param-manifold,thm:asymptotic-normality-gmlm}, assuming all axis dimensions $p_k$ are non-degenerate, that is $p_k > 1$ for all $k = 1, \ldots, r$. Regardless, under our modeling choice we get the relation between the natural parameters $\mat{\gamma}_y$ of the conditional Ising model and the GMLM parameters $\mat{\beta}_k$ and $\mat{\Omega}_k$ as
\begin{equation}\label{eq:ising-natural-params}
% \t{\pinv{\mat{D}_p}}\mat{\gamma}_y
% = \vec(\mat{\Omega} + \diag(\mat{B}\vec{\ten{F}_y}))
@ -723,7 +728,7 @@ The same problem arives in gradient optimization. Therefore, before starting the
\subsubsection{Slightly Bigger Dimensions}\label{sec:ising-bigger-dim}
A big challenge with the Ising model is its high computational complexity. The reason is the sum over all binary vectors of length $p = \prod_{k = 1}^{r}p_k$ in the partition function \eqref{eq:ising-partition-function}. Computing the partition function exactly requires summing all $2^p$ binary vectors. For small dimensions, like $p\approx 10$, this is easily computed. Increasing the dimension beyond $20$ becomes extremely expensive, and for dimensions bigger than $30$ it is infeasible. Trying to avoid the evaluation of the log-likelihood and only computing its partial gradients via \cref{thm:grad} does not resolve the issue. The gradients require the inverse link, in other words the second moment \eqref{eq:ising-m2}, which, even after dropping the scaling factor $p_0$, still involves a sum over $2^p$ terms. In practice, this means that, with our model, optimizing the Ising model using exactly computed gradients is impossible for moderately sized problems.
For estimation of dimensions $p$ bigger than $20$, we use a Monte-Carlo method to estimate the second moment \eqref{eq:ising-m2}, required to compute the partial gradients of the log-likelihood. Specifically, we use a Gibbs-Sampler to sample from the conditional distribution and approximate the second moment in an importance sampling framework. This can be implemented quite efficiently while the estimation accuracy for the second moment is evaluated experimentaly which seems to be very reliable. simultaniously, we use the same approach to estimate the partition funciton. This though, is in comparison inaccurate, and may only be used to get a rough idea of the log-likelihood. Regardles, for our method, we only need the gradient for optimization where appropriate break conditions, not based on the likelihood, lead to a working method for MLE estimation.
For estimation in dimensions $p$ bigger than $20$, we use a Monte-Carlo method to estimate the second moment \eqref{eq:ising-m2}, required to compute the partial gradients of the log-likelihood. Specifically, we use a Gibbs sampler to sample from the conditional distribution and approximate the second moment in an importance sampling framework. This can be implemented quite efficiently, and the estimation accuracy of the second moment, evaluated experimentally, is very reliable. Simultaneously, we use the same approach to estimate the partition function. %This though, is in comparison inaccurate, and may only be used to get a rough idea of the log-likelihood. Regardles, for our method, we only need the gradient for optimization where appropriate break conditions, not based on the likelihood, lead to a working method for MLE estimation.
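As an illustration (ignoring the importance weights), a plain Monte-Carlo approximation of the second moment based on $M$ Gibbs samples $\ten{X}^{(1)}, \ldots, \ten{X}^{(M)}$ drawn from $\ten{X}\mid Y = y$ has the form
\begin{displaymath}
\widehat{\E}[(\vec{\ten{X}})\t{(\vec{\ten{X}})}\mid Y = y] = \frac{1}{M}\sum_{m = 1}^{M} (\vec{\ten{X}^{(m)}})\t{(\vec{\ten{X}^{(m)}})}.
\end{displaymath}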
\begin{figure}
\centering
@ -799,7 +804,7 @@ As a basis to ensure that the constraint parameter space $\Theta$ is a manifold,
\end{theorem}
\subsection{Matrix Manifolds}\label{sec:matrix-manifolds}
A powerful side effect of \cref{thm:param-manifold} is the modeling flexibinity it provides. For example, we can perform low rank regression. Or, we may constrain two-way interactions between direct axis neighbors by using band matrices for the $\mat{\Omega}_k$'s, among others.
A powerful side effect of \cref{thm:param-manifold} is the modeling flexibility it provides. For example, we can perform low rank regression. Or, we may constrain two-way interactions between direct axis neighbors by using band matrices for the $\mat{\Omega}_k$'s, among others.
This flexibility derives from the many different matrix manifolds that can be used as building blocks $\manifold{B}_k$ and $\manifold{O}_k$ of the parameter space $\Theta$ in \cref{thm:param-manifold}. A list of possible choices, among others, is given in \cref{tab:matrix-manifolds}. As long as parameters in $\Theta$ are valid parameterizations of a density (or PMF) of \eqref{eq:quadratic-exp-fam} subject to \eqref{eq:eta1-manifold} and \eqref{eq:eta2-manifold}, one may choose any of the manifolds listed in \cref{tab:matrix-manifolds}, which are either cones or spherical. We also included an example which is neither a sphere nor a cone. Such manifolds may also be valid building blocks, but require more work as they do not directly lead to a parameter manifold by \cref{thm:param-manifold}. In case one can show the resulting parameter space $\Theta$ is an embedded manifold, the asymptotic theory of \cref{sec:asymtotics} is applicable.
@ -809,22 +814,22 @@ This flexibility derives from many different matrix manifolds that can be used a
Symbol & Description & C & S & Dimension\\ \hline
$\mathbb{R}^{p\times q}$ & All matrices of dimension $p\times q$ &
\checkmark & \xmark & $p q$ \\ \hline
& Full rank $p\times q$ matrices &
$\mathbb{R}_{*}^{p\times q}$ & Full rank $p\times q$ matrices &
\checkmark & \xmark & $p q$ \\ \hline
$\mathrm{St}(p, q)$ & \emph{Stiefel Manifold}, $\{ \mat{U}\in\mathbb{R}^{p\times q} : \t{\mat{U}}\mat{U} = \mat{I}_q \}$ for $q\leq p$ &
$\Stiefel{p}{q}$ & \emph{Stiefel Manifold}, $\{ \mat{U}\in\mathbb{R}^{p\times q} : \t{\mat{U}}\mat{U} = \mat{I}_q \}$ for $q\leq p$ &
\xmark & \checkmark & $p q - q (q + 1) / 2$ \\ \hline
$\mathcal{S}^{p - 1}$ & Unit sphere in $\mathbb{R}^p$, special case $\Stiefel{p}{1}$ &
\xmark & \checkmark & $p - 1$ \\ \hline
$U(p)$ & Unitary Group, special case $\Stiefel{p}{p}$ &
$\UnitaryGrp{p}$ & Unitary Group, special case $\Stiefel{p}{p}$ &
\xmark & \checkmark & $p (p - 1) / 2$ \\ \hline
$SU(p)$ & Special Unitary Group $\{ \mat{U}\in U(p) : \det{\mat{U}} = 1 \}$ &
$\SpecialUnitaryGrp{p}$ & Special Unitary Group $\{ \mat{U}\in U(p) : \det{\mat{U}} = 1 \}$ &
\xmark & \checkmark & $p (p - 1) / 2$ \\ \hline
& Matrices of known rank $r > 0$, generalizes $\StiefelNonCompact{p}{q}$ &
$\mathbb{R}_{r}^{p\times q}$ & Matrices of known rank $r > 0$, generalizes $\StiefelNonCompact{p}{q}$ &
\checkmark & \xmark & $r(p + q - r)$ \\ \hline
& Symmetric matrices &
\checkmark & \xmark & $\frac{1}{2}p(p+1)$ \\ \hline
$SPD(p)$ & Symmetric Positive Definite matrices &
\checkmark & \xmark & $\frac{1}{2}p(p+1)$ \\ \hline
\checkmark & \xmark & $p (p + 1) / 2$ \\ \hline
$\SymPosDefMat{p}$ & Symmetric Positive Definite matrices &
\checkmark & \xmark & $p (p + 1) / 2$ \\ \hline
& Scaled Identity $\{ a\mat{I}_p : a\in\mathbb{R}_{+} \}$ &
\checkmark & \xmark & $1$ \\ \hline
& Symmetric $r$-band matrices (includes diagonal) &
@ -874,10 +879,6 @@ It is not necessary to have a perfect maximizer, as long as the objective has fi
\end{theorem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Asymptotic Normality}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The following is a reformulation of \textcite[Lemma~2.3]{asymptoticMLE-BuraEtAl2018} which assumes Condition~2.2 to hold. But the existence of a mapping in Condition~2.2 is not needed for Lemma~2.3. It suffices that the restricted parameter space $\Theta$ is a subset of the unrestricted parameter space $\Xi$, which is trivially satisfied in our setting. Under this, \cref{thm:exists-strong-M-estimator-on-subsets} follows directly from \textcite[Lemma~2.3]{asymptoticMLE-BuraEtAl2018}.
\begin{theorem}[Existence of strong M-estimators on Subsets]\label{thm:exists-strong-M-estimator-on-subsets}
@ -920,21 +921,42 @@ The following is a reformulation of \textcite[Lemma~2.3]{asymptoticMLE-BuraEtAl2
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Simulations}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We compair our method with a few other methods for the tensor normal and the Ising model (inverse Ising problem).
In this section we provide simulation results for the tensor normal as well as the Ising model, where different aspects of the GMLM model are compared against other methods. The comparison methods are TSIR \parencite{tsir-DingCook2015}, MGCCA \parencite{MGCCA-GirkaEtAl2024} and HOPCA \todo{cite?!}, for both continuous and binary data. In the case of binary data, the binary values are simply treated as continuous. As a baseline we also include classic PCA on vectorized observations. \todo{check, fix, ...}
All experiments are performed with different sample sizes $n = 100, 200, 300, 500$ and $750$. Every experiment is repeated $100$ times.
We are interested in the quality of the estimate of the true sufficient reduction $\ten{R}(\ten{X})$ from \cref{thm:sdr}. Therefore, we compare against the true vectorized reduction matrix $\mat{B} = \bigkron_{k = r}^{1}\mat{\beta}_k$, since the vectorized version is compatible with any linear reduction method. The distance $d(\mat{B}, \hat{\mat{B}})$ between $\mat{B}\in\mathbb{R}^{p\times q}$ and an estimate $\hat{\mat{B}}\in\mathbb{R}^{p\times \tilde{q}}$ is the \emph{subspace distance}, which is proportional to
\begin{displaymath}
d(\mat{B}, \hat{\mat{B}}) \propto \| \mat{B}\pinv{(\t{\mat{B}}\mat{B})}\t{\mat{B}} - \hat{\mat{B}}\pinv{(\t{\hat{\mat{B}}}\hat{\mat{B}})}\t{\hat{\mat{B}}} \|_F.
\end{displaymath}
That is, the Frobenius norm of the difference between the projections onto the span of $\mat{B}$ and $\hat{\mat{B}}$. The proportionality constant\footnote{It depends on the row dimension $p$ and the ranks of $\mat{B}$ and $\hat{\mat{B}}$ and is given by $(\min(\rank\mat{B} + \rank\hat{\mat{B}}, 2 p - (\rank\mat{B} + \rank\hat{\mat{B}})))^{-1/2}$.} of $d(\mat{B}, \hat{\mat{B}})$ ensures that the subspace distance is in the interval $[0, 1]$. A distance of zero implies equality of the spans, a distance of one means that the subspaces are orthogonal.
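% A minimal R sketch of this subspace distance (illustrative only, not the
% implementation used for the simulations; `B` and `B.hat` are numeric matrices
% with the same number of rows):
%   proj <- function(B) B %*% MASS::ginv(crossprod(B)) %*% t(B)
%   subspace_dist <- function(B, B.hat) {
%     rk <- qr(B)$rank + qr(B.hat)$rank
%     norm(proj(B) - proj(B.hat), "F") / sqrt(min(rk, 2 * nrow(B) - rk))
%   }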
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Tensor Normal}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
For every tensor normal model we draw i.i.d. samples $\ten{X}_i$, for $i = 1, ..., n$, from the conditional distribution of $\ten{X}\mid Y = y_i$, where the $y_i$ are i.i.d. samples from the standard normal distribution. The conditional distribution $\ten{X}\mid Y = y_i$ depends on the choice of the GMLM parameters $\overline{\ten{\eta}}$, $\mat{\beta}_1, ..., \mat{\beta}_r$, $\mat{\Omega}_1, ..., \mat{\Omega}_r$, and the function $\ten{F}_y$ of $y$. In all experiments we set $\overline{\ten{\eta}} = \mat{0}$. The other parameters and $\ten{F}_y$ are described per experiment. With the true GMLM parameters and $\ten{F}_y$ given, we compute the conditional tensor normal mean $\ten{\mu}_y = \ten{F}_y\mlm_{k = 1}^{r}\mat{\Omega}_k^{-1}\mat{\beta}_k$ and covariances $\mat{\Sigma}_k = \mat{\Omega}_k^{-1}$ as in \eqref{eq:tnormal_cond_params}.
We start with a one-dimensional linear dependence on $y$ in 1a). In 1b-d), the dependence on $y$ is via a cubic polynomial. In 1b) the reduction is full rank, in contrast to 1c) where the $\mat{\beta}_k$'s are of rank $1$, in other words, low rank regression. In 1d) we constrain the inverse covariances $\mat{\Omega}_k$ to be tri-diagonal. Both 1c) and 1d) are examples of building the parameter space according to \cref{thm:param-manifold}. The final tensor normal experiment 1e) is a model misspecification. The true model does \emph{not} have a Kronecker structure and the ``known'' function $\ten{F}_y$ of $y$ is misspecified as well.
\begin{itemize}
\item[a] A simple setup with linear relation for
\item[b]
\item[c]
\item[d]
\item[e] Missspecified model .......
\item[1a)] The predictors $\ten{X}$ are $2\times 3\times 5$ dimensional, that is $r = 3$. The dependence through the inverse regression model is linear, meaning that $\ten{F}_y\equiv y$ is a $1\times 1\times 1$ tensor. The true $\mat{\beta}_k$'s are all equal to $\mat{e}_1\in\mathbb{R}^{p_k}$, the first unit vector, for $k \in \{1, 2, 3\}$. The matrices $\mat{\Omega}_k = \mathrm{AR}(0.5)$ follow an auto-regressive-like structure. That is, the elements are given by $(\mat{\Omega}_k)_{i j} = 0.5^{|i - j|}$.
\item[1b)] The predictors $\ten{X}$ are again $3$ dimensional with dimension $2\times 3\times 5$ and relate to the response $y$ via a cubic polynomial. This is modeled via $\ten{F}_y$ of dimension $2\times 2\times 2$, the twice iterated outer product of the vector $(1, y)$. Element-wise this reads $(\ten{F}_y)_{i j k} = y^{i + j + k - 3}$. All $\mat{\beta}_k$'s are set to $(\mat{e}_1, \mat{e}_2)\in\mathbb{R}^{p_k\times 2}$ with $\mat{e}_i$ the $i$'th unit vector, and the $\mat{\Omega}_k$'s are $\mathrm{AR}(0.5)$.
\item[1c)] Similar to 1b), except that the GMLM parameters $\mat{\beta}_k$ are of rank $1$, given by
\begin{displaymath}
\mat{\beta}_1 = \begin{pmatrix} 1 & -1 \\ -1 & 1 \end{pmatrix},\quad
\mat{\beta}_2 = \begin{pmatrix} 1 & -1 \\ -1 & 1 \\ 1 & -1 \end{pmatrix},\quad
\mat{\beta}_3 = \begin{pmatrix} 1 & -1 \\ -1 & 1 \\ 1 & -1 \\ -1 & 1 \\ 1 & -1 \end{pmatrix}.
\end{displaymath}
\item[1d)] Again like 1b). This time the true $\mat{\Omega}_k$'s, for $k = 1, 2, 3$, are tri-diagonal. Their elements are given by $(\mat{\Omega}_k)_{i j} = \delta_{0, |i - j|} + 0.5\delta_{1, |i - j|}$ with $\delta_{i, j}$ being the Kronecker delta.
\item[1e)] For the misspecified model we let $\ten{X}\mid Y$ be multivariate normal but \emph{not} tensor normal. Let $\ten{X}$ be $5\times 5$ dimensional, $Y$ be univariate standard normal, and $\mat{f}_y$ a $4$ dimensional vector given by $\mat{f}_y = (1, \sin(y), \cos(y), \sin(y)\cos(y))$. The true vectorized reduction matrix $\mat{B}$ is $25\times 4$, consisting of the first $4$ columns of the identity. The variance-covariance matrix $\mat{\Sigma}$ has an auto-regressive-like structure with correlation coefficient $0.5$. Element-wise, $\mat{B}_{i j} = \delta_{i j}$ and $\mat{\Sigma}_{i j} = 0.5^{|i - j|}$. Both $\mat{B}$ and $\mat{\Omega} = \mat{\Sigma}^{-1}$ violate the Kronecker product assumptions \eqref{eq:eta1} and \eqref{eq:eta2} of the GMLM model. Then, we set
\begin{displaymath}
\vec{\ten{X}}\mid (Y = y) = \mat{B}\mat{f}_y + \mat{\epsilon}, \qquad \mat{\epsilon}\sim\mathcal{N}_{25}(\mat{0}, \mat{\Sigma}).
\end{displaymath}
Furthermore, we fit the model with the wrong ``known'' function $\ten{F}_y$. We set $\ten{F}_y$ to be a $2\times 2$ matrix with a quadratic relation, its elements given by $(\ten{F}_y)_{i j} = y^{|i - j|}$.
\end{itemize}
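% A small R sketch of the construction in 1b) (illustrative only, not the
% simulation code itself):
%   F_y    <- function(y) outer(outer(c(1, y), c(1, y)), c(1, y))  # (F_y)_ijk = y^(i+j+k-3)
%   AR     <- function(p, rho = 0.5) rho^abs(outer(1:p, 1:p, `-`)) # (Omega_k)_ij = rho^|i-j|
%   Omegas <- lapply(c(2, 3, 5), AR)
%   betas  <- lapply(c(2, 3, 5), function(p) diag(1, p, 2))        # columns e_1, e_2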
\begin{figure}
\centering
\includegraphics[width = \textwidth]{plots/sim-normal.pdf}
@ -942,7 +964,7 @@ We compair our method with a few other methods for the tensor normal and the Isi
\end{figure}
\begin{figure}
\begin{figure}[ht!]
\centering
\includegraphics[width = \textwidth]{plots/sim-tsir.pdf}
\caption{\label{fig:sim-tsir}Simulation to investigate the unexpected failure of TSIR in simulation 1c.}
@ -955,7 +977,7 @@ We compair our method with a few other methods for the tensor normal and the Isi
\begin{figure}
\centering
\includegraphics[]{plots/sim-ising.pdf}
\includegraphics[width = \textwidth]{plots/sim-ising.pdf}
\caption{\label{fig:sim-ising}\todo{Caption for the Ising model simulation results.}}
\end{figure}
@ -972,52 +994,34 @@ We compair our method with a few other methods for the tensor normal and the Isi
\caption{\label{tab:sim-ising}\todo{Caption for the Ising model simulation table.}}
\end{table}
% The \emph{tensor normal distribution} $\mathcal{TN}(\ten{\mu}, \mat{\Sigma}_1, ..., \mat{\Sigma}_r)$ is a generalization of the \emph{matrix normal distribution} $\mathcal{MN}(\mat{\mu}, \mat{\Sigma}_1, \mat{\Sigma}_2)$. \todo{ref} The density of the conditional tensor normal distribution $\ten{X}\mid Y = y$ according to the quadratic exponential family \eqref{eq:quadratic-exp-fam} where only the first moment depends on $y$ is given by
% \begin{displaymath}
% f_{\mat{\theta}}(\ten{X}\mid Y = y) = (2\pi)^{-p/2}\prod_{k = 1}^r |\mat{\Sigma}_{k}|^{-p / 2 p_{k}}\exp\Big(
% -\frac{1}{2}\Big\langle \ten{X} - \ten{\mu}_y, (\ten{X} - \ten{\mu}_y)\mlm_{k\in[r]}\mat{\Sigma}_{k} \Big\rangle
% \Big)
% \end{displaymath}
% Rewriting this in the form of an exponential family as in \eqref{eq:quadratic-exp-fam} allows to determin the natural parameter components $\mat{\eta}_{y1}$ and $\mat{\eta}_2$. Since a non-degenerate normal distribution requires the covariance matrices $\mat{\Sigma}_k$ to be symmetric positive definite the relation to the second moment natural parameter $\mat{\eta}_2$ simplifies as we can set $\mat{T}_2$ in \eqref{eq:eta2-manifold} to the identity. This then gives the relation to the natural parameters as in \eqref{eq:eta1-manifold} and \eqref{eq:eta2-manifold} as
% \begin{displaymath}
% \mat{\eta}_{1y} = \vec\Bigl(\ten{\mu}_y\mlm_{k = 1}^{r}\mat{\Sigma}_k\Bigr), \qquad
% \mat{\eta}_2 = c\t{\mat{D}_p}\vec\bigkron_{k = r}^{1}\mat{\Sigma}_k^{-1}
% \end{displaymath}
% with scaling constant $c = -1 / 2$. Modeling the natural parameters as in \eqref{eq:eta1} and \eqref{eq:eta2} relates the mean $\ten{\mu}_y$ and the covariance matrices $\mat{\Sigma}_k$ of the tensor normal to the generalized multi-linear model parameter $\overline{\ten{\eta}}$ and $\mat{\beta}_k$, $\mat{\Omega}_k$, for $k = 1, \ldots, r$ through
% \begin{displaymath}
% \ten{\mu}_y = \Bigl(\overline{\ten{\eta}} + \ten{F}_y\mlm_{j = 1}^{r}\mat{\beta}_j\Bigr)\mlm_{k = 1}^{r}\mat{\Omega}_k^{-1}, \qquad \mat{\Omega}_k = \mat{\Sigma}_k^{-1}.
% \end{displaymath}
% This completely determines the tensor normal distribution given the GMLM parameter.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Data Analysis}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
In this section we perform two \todo{really two!} applications of the GMLM model on real data. The first example is the tensor normal model applied to EEG data\todo{do this!}. Next, we perform a proof of concept data analysis example for chess. The main purpose of choosing chess is twofold. First, we can illustrate an explicit use case for the (until now ignored) linear constraint matrix $\mat{T}_2$. Second, it is a personal interest of one of the authors.\todo{???}
% To estimate the GMLM parameters $\mat{\theta} = (\overline{\ten{\eta}}, \mat{\beta}_1, \ldots, \mat{\beta}_r, \mat{\Omega}_1, \ldots\mat{\Omega}_r)$ given a data set $(\ten{X}_i, y_i)$ of $i = 1, \ldots, n$ observation we use the gradients provided by \cref{thm:grad}. It turns out that the equations $\nabla_{\overline{\ten{\eta}}}l_n = 0, \nabla_{\mat{\beta}_j}l_n = 0$ and $\nabla_{\mat{\Omega}_j}l_n = 0$, for $j = 1, \ldots, r$, can be solved for the differentiation variable assuming all the other parameter blocks to be constant. Centering the observed $\ten{X}$ leads to a cleaner formulation given by \todo{fix the following!}
% \begin{align*}
% \hat{\overline{\ten{\eta}}} &= \frac{1}{n}\sum_{i = 1}^{n} \ten{X}_i, \\
% \t{\hat{\mat{\beta}}_j} &= \mat{\Omega}_j \Bigl(\Bigl(\ten{F}_{y_i}\mlm_{k \neq j}\mat{\Omega}_k^{-1}\mat{\beta}_k\Bigr)_{(j)}\t{\Bigl(\ten{F}_{y_i}\mlm_{k \neq j}\mat{\beta}_k\Bigr)_{(j)}}\Bigr)^{-1}\Bigl(\ten{F}_{y_i}\mlm_{k \neq j}\mat{\beta}_k\Bigr)_{(j)}\ten{X}_{(j)}, \\
% \t{\Omega}_j &= scaling \frac{1}{n}
% \end{align*}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{EEG}
% This allows to use a \emph{block coordinate descent} method instead of gradient descent. This method keeps all but one parameter block fixed and optimized the objective for a single block. Given a closed form solution for the partial gradients, only a single update is required to solve the partial optimization problem. This means that the block coordinate descent method reduces to a cyclic updating. This not only converges very fast, it also does not require any hyper parameters.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Chess}\label{sec:chess}
The data set is provided by the \citetitle{lichess-database}\footnote{\fullcite{lichess-database}}. We downloaded the November 2023 database, consisting of more than $92$ million games. We removed all games without Stockfish\footnote{\fullcite{stockfish}} position evaluations. Those take the role of the response $Y$ and correspond to a winning probability from white's point of view. Positive scores are good for white and negative scores indicate an advantage for black. We ignore all highly unbalanced positions, defined as positions with absolute score above $5$. We also remove all positions with a mate score (one side can force checkmate). Finally, we only consider positions with white to move. This leads to a final data set of roughly $190$ million positions, including duplicates.
% For this iterative scheme we do need some initial estimates. For those we
A chess position is encoded as a 3D binary tensor of dimension $8\times 8\times 12$, giving the predictors $\ten{X}$, see \cref{fig:fen2tensor}. The first two axes encode the squares of a chess board, which is an $8\times 8$ grid. The third axis encodes the chess pieces. The $12$ pieces derive from the $6$ types of pieces, namely pawns (\pawn), knights (\knight), bishops (\bishop), rooks (\rook), queens (\queen) and kings (\king), in the two colors black and white.
\begin{figure}[hp!]
\centering
\includegraphics[width = \textwidth]{images/fen2tensor.pdf}
\caption{\label{fig:fen2tensor}The chess start position and its 3D binary tensor representation, empty entries are $0$.}
\end{figure}
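% Illustrative base R sketch of this encoding (the data pipeline itself uses
% `Rchess::fen2int`; the piece ordering along the third axis is an assumption):
%   fen2tensor <- function(fen) {
%     pieces <- c("P", "N", "B", "R", "Q", "K", "p", "n", "b", "r", "q", "k")
%     X <- array(0L, c(8, 8, 12))
%     ranks <- strsplit(strsplit(fen, " ")[[1]][1], "/")[[1]]  # ranks 8 down to 1
%     for (r in seq_along(ranks)) {
%       file <- 1L
%       for (ch in strsplit(ranks[r], "")[[1]]) {
%         if (ch %in% as.character(1:8)) { file <- file + as.integer(ch); next }
%         X[9L - r, file, match(ch, pieces)] <- 1L
%         file <- file + 1L
%       }
%     }
%     X
%   }
%   X <- fen2tensor("rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1")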
Now, we assume that $(\ten{X}, y)$ follows the Ising GMLM model of \cref{sec:ising_estimation}. The relation to the scores $y$ is modeled as a cubic relation via the $2\times 2\times 2$ dimensional tensor $\ten{F}_y$ with elements $(\ten{F}_y)_{i j k} = y^{i + j + k - 3}$.
% \subsection{Matrix Normal and Ising Model}
Due to the rules of chess, given any encoded legal position $\ten{X}$, there are some elements which are never set. These are the elements corresponding to pawns on the 1st and 8th rank. This implies that the matrix $\mat{T}_2$ in \eqref{eq:t-stat} under model \eqref{eq:quadratic-exp-fam} can \emph{not} be the identity as, for simplicity, assumed in \cref{sec:ising_estimation}. The adaptation in this case is simple. We only need to enforce the corresponding diagonal elements of $\mat{\Omega}$ to be zero, which is a linear operation. The only difference in the estimation procedure is to use $\ten{G}_2$ in \cref{thm:grad} for gradient computations. This can be extended to incorporate additional constraints into the model derived from the game of chess. First, all interactions of pieces with pawns on the 1st or 8th rank are impossible. Next, there is always exactly one king per color on the board, and they can not be on neighboring squares. Finally, only one piece can occupy a square. All of those conditions can be written as linear constraints on $\mat{\Omega}$ by setting the corresponding entries of $\mat{\Omega}$ to zero.
% If $\mat{X} \in \mathbb{R}^{p_1 \times p_2}$, then $\mat{\theta} = (\overline{\eta}, \mat{\beta}_1, \mat{\beta}_2, \mat{\Omega}_1, \mat{\Omega}_2)$ and $\mat{F}_y$ is also matrix valued. The conditional pdf of $\mat{X}\mid Y$ is
% \begin{align*}
% f_{\mat{\theta}}(\mat{X}\mid Y = y)
% &= h(\mat{X})\exp(\langle\mat{X}, \mat{\eta}_1(\mat{\theta})\rangle + \langle\mat{X}\circ\mat{X}, \mat{\eta}_2(\mat{\theta})\rangle - b(\mat{\eta}_y(\mat{\theta}))) \\
% &= h(\mat{X})\exp(\tr((\overline{\mat{\eta}} + \mat{\beta}_1\mat{F}_y\t{\mat{\beta}_2})\t{\mat{X}}) + \tr(\mat{\Omega}_1\mat{X}\mat{\Omega}_2\t{\mat{X}}) - b(\mat{\eta}_y(\mat{\theta}))).
% \end{align*}
% The MLE estimate $\hat{\mat{\theta}}_n = (\widehat{\overline{\mat{\eta}}}, \widehat{\mat{\beta}}_2\otimes \widehat{\mat{\beta}}_1, \widehat{\mat{\Omega}}_2\otimes \widehat{\mat{\Omega}}_2)$ is asymptotically normal and
% \begin{displaymath}
% \widehat{\ten{R}}(\mat{X}) = \t{(\widehat{\mat{\beta}}_2\otimes \widehat{\mat{\beta}}_1)}\vec(\mat{X} - \E\mat{X}) \equiv \t{\widehat{\mat{\beta}}_1}(\mat{X} - \E\mat{X})\widehat{\mat{\beta}}_2
% \end{displaymath}
% is the MLE of the sufficient reduction $\ten{R}(\mat{X})$ of dimension $q_1 \times q_2 \leq p_1 \times p_2 $.\\[1.6em]
Due to the raw scale of the data, millions of observations, it is computationally infeasible to compute the gradients on the entire data set. Simply using a computationally manageable subset is not an option either. The problem is that the Ising model is of dimension $8\cdot 8\cdot 12 = 768$, which requires an enormous amount of data to get a reasonable picture of the interaction effects. The solution is to switch from classic gradient-based optimization to a stochastic version. This means that every gradient update uses a new random subset of the entire data set. Therefore, we draw independent random samples of the data. The independence of samples derives from the independence of games, and every sample is drawn from a different game.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\appendix

View File

@ -1,55 +1,55 @@
rho order beta.version dist.subspace.gmlm dist.subspace.tsir dist.subspace.sir
0 2 1 0.05706344996 0.04870725477 0.02949768844
0.1 2 1 0.06099344597 0.0544879924 0.1443251451
0.2 2 1 0.06006975287 0.04853609519 0.278326568
0.3 2 1 0.06258542237 0.0577666547 0.398026673
0.4 2 1 0.0630860656 0.06434322354 0.50784661
0.5 2 1 0.0681407852 0.06660780135 0.600152663
0.6 2 1 0.0704756441 0.08007941831 0.678782131
0.7 2 1 0.07399226468 0.09624028404 0.741513693
0.8 2 1 0.08696351437 0.1434422575 0.792636477
0 3 1 0.03884761964 0.05075637043 0.01559641056
0.1 3 1 0.04181920387 0.04806015013 0.172176562
0.2 3 1 0.04446027941 0.05200675213 0.33300157
0.3 3 1 0.04431383228 0.06116237486 0.477540231
0.4 3 1 0.04913731328 0.0818369603 0.599066199
0.5 3 1 0.0508888889 0.109494101 0.698420153
0.6 3 1 0.0650707898 0.164995207 0.776338247
0.7 3 1 0.0722256221 0.2796157782 0.83533372
0.8 3 1 0.111673742 0.5800304654 0.879376086
0 4 1 0.0310105382 0.0730134344 0.0071848475
0.1 4 1 0.03266222395 0.0768687025 0.197565596
0.2 4 1 0.03283241074 0.0918531581 0.381220751
0.3 4 1 0.03606864228 0.1271383726 0.539976855
0.4 4 1 0.0439046288 0.1889255293 0.669113261
0.5 4 1 0.0541551555 0.300616954 0.768377223
0.6 4 1 0.06073810269 0.518481551 0.841219015
0.7 4 1 0.08029184283 0.844190422 0.892810124
0.8 4 1 0.1426333851 0.97160198 0.928311852
0 2 2 0.04698636073 0.0520330931 0.0149037134
0.1 2 2 0.04930602754 0.05971055866 0.01176055483
0.2 2 2 0.0533056929 0.06564519596 0.00926994104
0.3 2 2 0.05912356401 0.08914725578 0.007123095258
0.4 2 2 0.07063246582 0.11735093643 0.00570685511
0.5 2 2 0.08335540383 0.1632686417 0.003445887997
0.6 2 2 0.09303190179 0.2411664985 0.002556417498
0.7 2 2 0.12298692 0.3419166697 0.001596502137
0.8 2 2 0.1628933779 0.574004413 0.000854865949
0 3 2 0.0378753001 0.0922734022 0.00524888557
0.1 3 2 0.03919134772 0.1392374104 0.00405657452
0.2 3 2 0.0512381601 0.2187110985 0.002917695203
0.3 3 2 0.05320741753 0.339382769 0.001845247336
0.4 3 2 0.05652189257 0.476061088 0.00128800652
0.5 3 2 0.0782460901 0.653943167 0.000821208511
0.6 3 2 0.0986561143 0.834839108 0.000482747573
0.7 3 2 0.1380699767 0.967224131 0.000250640226
0.8 3 2 0.1784597136 0.997450021 0.000106823797
0 4 2 0.03222682489 0.2450182862 0.00188728903
0.1 4 2 0.03486174389 0.4142114066 0.00115216396
0.2 4 2 0.03931006383 0.669792609 0.00078017042
0.3 4 2 0.0481088931 0.853140232 0.00048375608
0.4 4 2 0.0598583401 0.968730327 0.000294225075
0.5 4 2 0.0747933415 0.99692433 0.0001634921303
0.6 4 2 0.1156758178 0.999887965 8.50620125e-05
0.7 4 2 0.1841296132 0.99999922 3.88864634e-05
0.8 4 2 0.0601816739 1 1.275924145e-05
0 2 1 0.05561496033 0.05061728586 0.0937440796
0.1 2 1 0.05695142139 0.0470966691 0.0926586359
0.2 2 1 0.06091107141 0.05239605337 0.0908641089
0.3 2 1 0.06307756487 0.05222743771 0.1017065255
0.4 2 1 0.0660642872 0.06165316957 0.101956927
0.5 2 1 0.0607144752 0.06296036226 0.1132399708
0.6 2 1 0.0680270013 0.07738945736 0.1338582214
0.7 2 1 0.08308930348 0.1022448411 0.1719732064
0.8 2 1 0.09393400477 0.1391586209 0.26938282
0 3 1 0.0417583747 0.04632848929 0.2461782677
0.1 3 1 0.0434276533 0.05218873186 0.2213153802
0.2 3 1 0.04597206968 0.0570669677 0.249537892
0.3 3 1 0.04502304399 0.0614214213 0.27102197
0.4 3 1 0.0473382351 0.0792024647 0.319204514
0.5 3 1 0.0566416444 0.1123603747 0.402911276
0.6 3 1 0.0635449054 0.1726250727 0.514738707
0.7 3 1 0.0790087287 0.3119839854 0.696578663
0.8 3 1 0.1049010818 0.625019551 0.901082248
0 4 1 0.0318544676 0.0756838662 0.763664746
0.1 4 1 0.0291616189 0.0732198203 0.724917238
0.2 4 1 0.0339593815 0.0892676958 0.727812142
0.3 4 1 0.033896654 0.1217472737 0.802679627
0.4 4 1 0.0421267215 0.1792376247 0.875051648
0.5 4 1 0.0497214363 0.2948337295 0.920789642
0.6 4 1 0.0649548512 0.516274211 0.961926272
0.7 4 1 0.0796107149 0.82163525 0.969975565
0.8 4 1 0.1319282631 0.952178592 0.969918026
0 2 2 0.04419882713 0.05007045448 0.0981350583
0.1 2 2 0.04771625635 0.06317718403 0.0994558305
0.2 2 2 0.05842513124 0.07257500657 0.1393608074
0.3 2 2 0.06074603789 0.0937379307 0.1469685419
0.4 2 2 0.06804359146 0.13265083527 0.1811307013
0.5 2 2 0.08431280029 0.15490492217 0.2644350099
0.6 2 2 0.0972256132 0.2322527248 0.3252648145
0.7 2 2 0.11758589026 0.3462559988 0.493115596
0.8 2 2 0.17756305 0.5735205103 0.688565503
0 3 2 0.03463284148 0.0894755449 0.413328838
0.1 3 2 0.04180307759 0.1390793707 0.497959545
0.2 3 2 0.0460221535 0.2086373171 0.63903354
0.3 3 2 0.0537508593 0.3124281256 0.748414045
0.4 3 2 0.060618005 0.495823454 0.890088075
0.5 3 2 0.0853542084 0.6712004401 0.956569545
0.6 3 2 0.0910737894 0.853848871 0.985803877
0.7 3 2 0.1435666309 0.965105066 0.995326644
0.8 3 2 0.1842180974 0.993108512 0.996942006
0 4 2 0.03189456039 0.260763794 0.958145847
0.1 4 2 0.03256901682 0.413864983 0.981053177
0.2 4 2 0.03944012707 0.635137383 0.99257835
0.3 4 2 0.0491580489 0.87045687 0.99829348
0.4 4 2 0.0633184796 0.961679634 0.999828802
0.5 4 2 0.0785727515 0.996049666 0.999905562
0.6 4 2 0.118468394 0.99986322 0.999134535
0.7 4 2 0.1952382107 0.999994091 0.9319280744
0.8 4 2 0.055013371 0.999999997 0.87224130919

View File

@ -87,17 +87,17 @@
name=v2,
xshift = 8cm
]
\addarea{tsir}{2}{4}{4.05};
\addarea{gmlm}{2}{4}{4.0};
\addarea{sir}{2}{4}{3.95};
\addarea{sir}{2}{4}{4.05};
\addarea{tsir}{2}{4}{4.0};
\addarea{gmlm}{2}{4}{3.95};
\addarea{tsir}{2}{3}{3.05};
\addarea{gmlm}{2}{3}{3.0};
\addarea{sir}{2}{3}{2.95};
\addarea{sir}{2}{3}{3.05};
\addarea{tsir}{2}{3}{3.0};
\addarea{gmlm}{2}{3}{2.95};
\addarea{tsir}{2}{2}{2.05};
\addarea{gmlm}{2}{2}{2.0};
\addarea{sir}{2}{2}{1.95};
\addarea{sir}{2}{2}{2.05};
\addarea{tsir}{2}{2}{2.0};
\addarea{gmlm}{2}{2}{1.95};
\end{axis}
\node[anchor = south] at (v2.north) {V2};

dataAnalysis/chess/chess.R Normal file
View File

@ -0,0 +1,100 @@
options(keep.source = TRUE, keep.source.pkgs = TRUE)
library(tensorPredictors)
library(Rchess)
source("./gmlm_chess.R")
# Data set file name of chess positions with Stockfish [https://stockfishchess.org]
# evaluation scores (downloaded and processed by `./preprocessing.sh` from the
# lichess data base [https://database.lichess.org/])
data_set <- "lichess_db_standard_rated_2023-11.fen"
# Function to draw samples `X` from the chess positions in `data_set`, conditioned
# on the scores `Y` lying in the interval `score_min` to `score_max`.
data_gen <- function(batch_size, score_min, score_max) {
Rchess::fen2int(Rchess::data.gen(data_set, batch_size, score_min, score_max))
}
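# Cubic dependence on the score y: fun_y(y) is a 2 x 2 x 2 x length(y) array
# with entries y^(i + j + k - 3), matching the tensor F_y in the paper.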
fun_y <- function(y) {
F <- t(outer(y, c(0, 1, 1, 2, 1, 2, 2, 3), `^`))
dim(F) <- c(2, 2, 2, length(y))
F
}
# Invoke specialized GMLM optimization routine for chess data
fit.gmlm <- gmlm_chess(data_gen, fun_y)
load("/home/loki/Work/tensorPredictors/dataAnalysis/chess/gmlm_chess_save_point_000000.Rdata")
load("/home/loki/Work/tensorPredictors/dataAnalysis/chess/gmlm_chess_save_point_000274.Rdata")
load("/home/loki/Work/tensorPredictors/dataAnalysis/chess/gmlm_chess_save_point_000532.Rdata")
# build intervals from score break points
score_breaks <- c(-5.0, -3.0, -2.0, -1.0, -0.5, -0.2, 0.2, 0.5, 1.0, 2.0, 3.0, 5.0)
score_min <- head(score_breaks, -1)
score_max <- tail(score_breaks, -1)
score_means <- (score_min + score_max) / 2
# build Omega constraint, that is the set of impossible combinations
# (including self interactions) due to the rules of chess
Omega_const <- local({
# One piece per square
diag_offset <- abs(.row(c(768, 768)) - .col(c(768, 768)))
Omega_const <- !diag(768) & ((diag_offset %% 64L) == 0L)
# One King per color
Omega_const <- Omega_const | kronecker(diag(1:12 %in% c(6, 12)), !diag(64), `&`)
# no pawns on rank 1 or rank 8
pawn_const <- tcrossprod(as.vector(`[<-`(matrix(0L, 8, 8), c(1, 8), , 1L)), rep(1L, 64))
pawn_const <- kronecker(`[<-`(matrix(0, 12, 12), c(1, 7), , 1), pawn_const)
which(Omega_const | (pawn_const | t(pawn_const)))
})
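# `Omega_const` now holds the linear indices of all entries of the 768 x 768
# interaction matrix `Omega` that are forced to zero by the rules of chess.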
y <- score_means[5]
# Conditional Ising model parameters
Omega <- `[<-`(Reduce(kronecker, rev(Omegas)), Omega_const, 0)
params <- `diag<-`(Omega, as.vector(mlm(`dim<-`(fun_y(y), dimF), betas)))
# Conditional mean of the Ising model
mu_y <- ising_m2(params)
layout(matrix(c(
1, 2, 3, 3, 3,
1, 4, 5, 6, 7
), nrow = 2, byrow = TRUE), width = c(6, 3, 1, 1, 1))
legend("topright", col = c("red", "blue", "darkgreen"), lty = 1, lwd = 2,
legend = c("dist.B", "dist.Omega", "loss"), bty = "n")
range(Omega)
matrixImage(Omega)
matrixImage(mu_y)
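# Sanity check: draw positions from every score slice, apply the estimated
# reduction and regress the scores on the reduced predictors.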
X <- `dim<-`(Reduce(c, Map(data_gen, 512, score_min, score_max)), c(8, 8, 12, 512 * length(score_means)))
y <- rep(score_means, each = 512)
mean_X <- rowMeans(X, dims = 3)
X_reduced <- mlm(X - as.vector(mean_X), betas, transposed = TRUE)
summary(lm(y ~ mat(X_reduced, 4)))
plot(lm(y ~ mat(X_reduced, 4)))
# save_points <- sort(list.files(pattern = "save_point*"))
# load(save_points[length(save_points)])
# loss <- drop(mapply(function(file) {
# load(file)
# last_loss
# }, save_points))
# plot(loss, type = "b")

View File

@ -11,6 +11,8 @@
#' @param fun_y known functions of scalar `y`, returning a 3D/4D tensor
#' @param score_breaks numeric vector of two or more unique cut points, the cut
#' points are the interval bounds specifying the slices of `y`.
#' @param Omega_bounds numeric (may be infinite), maximum absolute value of the
#'   elements of `Omega`.
#' @param nr_threads integer, nr. of threads used by `ising_m2()`
#' @param mcmc_samples integer, nr. of Monte-Carlo Chains passed to `ising_m2()`
#' @param slice_size integer, size of sub-samples generated by `data_gen` for
@ -30,6 +32,7 @@ gmlm_chess <- function(
data_gen,
fun_y,
score_breaks = c(-5.0, -3.0, -2.0, -1.0, -0.5, -0.2, 0.2, 0.5, 1.0, 2.0, 3.0, 5.0),
# Omega_bounds = 4.6, # TODO: wip!!!!!
nr_threads = 8L,
mcmc_samples = 10000L,
slice_size = 512L,
@ -176,7 +179,7 @@ gmlm_chess <- function(
# Update tracker for break condition
non_improving <- max(0L, non_improving - 1L + 2L * (last_loss < loss))
loss_last <- loss
last_loss <- loss
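# `last_loss` is also written to the save points, hence it must hold the current loss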
# check break condition
if (non_improving > patience) { break }